# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
import time
import random
import difflib

# MQClient / MQExceptionBase used below are assumed to come from the Aliyun
# RocketMQ HTTP SDK (the mq_http_sdk package)
from mq_http_sdk.mq_client import MQClient
from mq_http_sdk.mq_exception import MQExceptionBase

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper
 
def get_user_from_mysql(log_type, crawler, source, env, action=''):
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    else:
        Common.logger(log_type, crawler).warning(f"Crawler: {crawler}, no user list to crawl was found")
        return []
 
def title_like(log_type, crawler, platform, title, env):
    """
    Title similarity check.
    :param log_type: log type
    :param crawler: which crawler
    :param platform: crawler channel, e.g. 公众号 / 小年糕
    :param title: video title
    :param env: environment
    :return: True if similarity >= 80%, otherwise False
    """
    select_sql = f""" select video_title from crawler_video where platform="{platform}" """
    video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
    # print(video_list)
    if len(video_list) == 0:
        return False
    for video_dict in video_list:
        video_title = video_dict["video_title"]
        # print(video_title)
        if difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
            return True
        else:
            continue
    return False
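
# Note (illustrative, not part of the original logic): quick_ratio() is an upper bound
# on ratio() computed from character multisets, so it can overestimate similarity, e.g.
#   difflib.SequenceMatcher(None, "abcd", "dcba").quick_ratio()  # -> 1.0
#   difflib.SequenceMatcher(None, "abcd", "dcba").ratio()        # -> 0.25
# which makes the 0.8 threshold above slightly looser than a true edit-similarity check.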
 
def get_config_from_mysql(log_type, source, env, text, action=''):
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    search_word_list = []
    for content in contents:
        config = content['config']
        # config is stored as a dict-literal string; eval turns it back into a dict
        config_dict = eval(config)
        for k, v in config_dict.items():
            # each value is a comma-separated string
            if k == "title":
                title_list.extend(v.split(","))
            if k == "filter":
                filter_list.extend(v.split(","))
            if k == "emoji":
                emoji_list.extend(v.split(","))
            if k == "search_word":
                search_word_list.extend(v.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
    elif text == "search_word":
        return search_word_list
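
# Illustrative example (hypothetical config row; the shape is inferred from the
# parsing above, one comma-separated string per key):
#   content["config"] = '{"title": "greeting,blessing", "filter": "advert,qrcode"}'
#   get_config_from_mysql("test", "gongzhonghao", "prod", "filter")  # -> ["advert", "qrcode"]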
 
def random_title(log_type, crawler, env, text):
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)
 
def task_fun(task_str):
    # task_str arrives as a list of (key, value) tuples; the 'rule' value is a
    # quoted list literal, so strip the extra quotes before eval
    task_str = task_str.replace("'[", '[').replace("]'", ']')
    task_dict = dict(eval(task_str))
    # flatten the rule list of single-key dicts into one rule dict
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    task_dict = {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }
    return task_dict
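
# Sketch of the expected round trip (hypothetical values; see the commented-out
# task_str in __main__ below for the real shape):
#   task_str = "[('task_id','11'),('mode','hour'),('rule','[{'duration':{'min':40,'max':0}}]')]"
#   task_fun(task_str)
#   # -> {"task_dict": {"task_id": "11", "mode": "hour", "rule": {"duration": {"min": 40, "max": 0}}},
#   #     "rule_dict": {"duration": {"min": 40, "max": 0}}}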
 
def get_consumer(topic_name, group_id):
    # Initialize the client.
    mq_client = MQClient(
        # HTTP endpoint; see the endpoint section on the instance detail page of
        # the ApsaraMQ for RocketMQ console.
        "${HTTP_ENDPOINT}",
        # AccessKey ID, the Alibaba Cloud credential identifier. See "Create an AccessKey".
        "${ACCESS_KEY}",
        # AccessKey Secret, the Alibaba Cloud credential secret. See "Create an AccessKey".
        "${SECRET_KEY}"
    )
    # Topic the messages belong to, created in the RocketMQ console.
    # topic_name = "${TOPIC}"
    topic_name = f"{topic_name}"
    # Group ID created in the RocketMQ console.
    # group_id = "${GROUP_ID}"
    group_id = f"{group_id}"
    # Instance ID that the topic belongs to, created in the RocketMQ console.
    # If the instance has a namespace, the instance ID must be passed; if it has no
    # namespace, pass an empty string. The namespace is shown on the instance detail page.
    instance_id = "${INSTANCE_ID}"
    consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
    return consumer
 
def ack_message(log_type, crawler, recv_msgs, consumer):
    # If a message is not acknowledged before msg.next_consume_time, it will be
    # delivered and consumed again.
    # The receipt handle carries a timestamp, so the same message gets a different
    # handle on every delivery.
    try:
        receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
        consumer.ack_message(receipt_handle_list)
        Common.logger(log_type, crawler).info(f"Ack {len(receipt_handle_list)} Message Succeed.\n")
    except MQExceptionBase as err:
        Common.logger(log_type, crawler).info(f"Ack Message Fail! Exception:{err}\n")
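
# Typical consume loop (a minimal sketch, assuming the mq_http_sdk consumer API;
# log_type / crawler, the batch size of 3 and the 3-second long-poll are placeholders):
#   consumer = get_consumer("${TOPIC}", "${GROUP_ID}")
#   while True:
#       try:
#           recv_msgs = consumer.consume_message(3, 3)
#       except MQExceptionBase as err:
#           if err.type == "MessageNotExist":  # queue is empty, keep polling
#               continue
#           raise
#       # ... process recv_msgs here ...
#       ack_message(log_type, crawler, recv_msgs, consumer)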
 
def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether a video should be downloaded.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, dict
    :param rule_dict: rule info, dict
    :return: True if every rule is satisfied, otherwise False
    """
    # Normalize video_dict: publish_time_stamp (seconds) -> publish_time (milliseconds)
    if "publish_time_stamp" in video_dict.keys():
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalize video_dict: period = days since publish
    if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
        video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
    # Normalize rule_dict: a max of 0 means "no upper bound"
    for rule_value in rule_dict.values():
        if rule_value["max"] == 0:
            rule_value["max"] = 999999999999999
    # For keys present in rule_dict but missing from video_dict, fill a mid-range
    # value so the missing field does not fail the check
    for rule_key in rule_dict.keys():
        if rule_key not in video_dict.keys():
            video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
    # Compare every field and return True / False
    for video_key, video_value in video_dict.items():
        for rule_key, rule_value in rule_dict.items():
            if video_key == rule_key == "period":
                result = 0 <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {rule_value["max"]}, {result}')
            elif video_key == rule_key:
                result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]}, {result}')
            else:
                result = True
            if result is False:
                return False
            else:
                continue
    return True
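
# Illustrative call (hypothetical values): a max of 0 is treated as "no upper bound"
# and period is derived from publish_time_stamp, so this returns True:
#   rule_dict = {"duration": {"min": 40, "max": 0}, "play_cnt": {"min": 4000, "max": 0}}
#   video_dict = {"duration": 60, "play_cnt": 5000, "publish_time_stamp": int(time.time()) - 3600}
#   download_rule("test", "demo", video_dict, rule_dict)  # -> True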
 
if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task_fun(task_str))
    pass