|
@@ -22,7 +22,7 @@ from common.scheduling_db import MysqlHelper
|
|
# from scheduling_db import MysqlHelper
|
|
# from scheduling_db import MysqlHelper
|
|
|
|
|
|
|
|
|
|
-def get_user_from_mysql(log_type, crawler, source, env, action=''):
|
|
|
|
|
|
+def get_user_from_mysql(log_type, crawler, source, env, action=""):
|
|
sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
|
|
sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
|
|
results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
|
|
results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
|
|
if results:
|
|
if results:
|
|
@@ -56,7 +56,9 @@ def title_like(log_type, crawler, platform, title, env):
|
|
:param env: 环境
|
|
:param env: 环境
|
|
:return: 相似度>=80%,返回 True;反之,返回 False
|
|
:return: 相似度>=80%,返回 True;反之,返回 False
|
|
"""
|
|
"""
|
|
- select_sql = f""" select video_title from crawler_video where platform="{platform}" """
|
|
|
|
|
|
+ select_sql = (
|
|
|
|
+ f""" select video_title from crawler_video where platform="{platform}" """
|
|
|
|
+ )
|
|
video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
|
|
video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
|
|
# print(video_list)
|
|
# print(video_list)
|
|
if len(video_list) == 0:
|
|
if len(video_list) == 0:
|
|
@@ -71,7 +73,7 @@ def title_like(log_type, crawler, platform, title, env):
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
-def get_config_from_mysql(log_type, source, env, text, action=''):
|
|
|
|
|
|
+def get_config_from_mysql(log_type, source, env, text, action=""):
|
|
select_sql = f"""select * from crawler_config where source="{source}" """
|
|
select_sql = f"""select * from crawler_config where source="{source}" """
|
|
contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
|
|
contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
|
|
title_list = []
|
|
title_list = []
|
|
@@ -79,7 +81,7 @@ def get_config_from_mysql(log_type, source, env, text, action=''):
|
|
emoji_list = []
|
|
emoji_list = []
|
|
search_word_list = []
|
|
search_word_list = []
|
|
for content in contents:
|
|
for content in contents:
|
|
- config = content['config']
|
|
|
|
|
|
+ config = content["config"]
|
|
config_dict = eval(config)
|
|
config_dict = eval(config)
|
|
for k, v in config_dict.items():
|
|
for k, v in config_dict.items():
|
|
if k == "title":
|
|
if k == "title":
|
|
@@ -110,8 +112,10 @@ def get_config_from_mysql(log_type, source, env, text, action=''):
|
|
|
|
|
|
def get_rule_from_mysql(task_id, log_type, crawler, env):
    """Load the ``rule`` column of one ``crawler_task_v3`` row and decode it.

    :param task_id: value interpolated into the ``id=`` filter of the query
    :param log_type: forwarded unchanged to ``MysqlHelper.get_values``
    :param crawler: forwarded unchanged to ``MysqlHelper.get_values``
    :param env: forwarded unchanged to ``MysqlHelper.get_values``
    :return: the JSON-decoded ``rule`` value of the first returned row
    """
    # NOTE(review): task_id is interpolated directly into the SQL text;
    # callers should only pass trusted, numeric ids.
    query = f"""select rule from crawler_task_v3 where id={task_id}"""
    rows = MysqlHelper.get_values(log_type, crawler, query, env, action="")
    # Only the first row is used; indexing fails if no row came back.
    first_row = rows[0]
    return json.loads(first_row["rule"])
|
|
|
|
|
|
|
|
|
|
def random_title(log_type, crawler, env, text):
|
|
def random_title(log_type, crawler, env, text):
|
|
@@ -120,34 +124,28 @@ def random_title(log_type, crawler, env, text):
|
|
|
|
|
|
|
|
|
|
def task_fun(task_str):
    """Parse a task string and merge its list of rule dicts into one dict.

    Every ``'[`` / ``]'`` sequence in the text is reduced to a bare bracket
    before parsing, so a rule list that was embedded as a single-quoted
    string is parsed as a real list. The dicts in that list are then merged
    into one mapping; on duplicate keys the later entry wins.

    :param task_str: textual form of a dict that contains a ``rule`` entry
    :return: ``{"task_dict": parsed, "rule_dict": merged}`` where
        ``parsed["rule"]`` is the very same object as ``merged``
    :raises KeyError: if the parsed payload has no ``rule`` entry
    """
    unquoted = task_str.replace("'[", "[").replace("]'", "]")
    # SECURITY NOTE(review): eval() executes arbitrary expressions. It is
    # kept so existing payloads parse exactly as before, but if task_str can
    # come from an untrusted source this should move to ast.literal_eval or
    # json.loads once the payload format is confirmed.
    parsed = dict(eval(unquoted))
    merged_rule = {
        key: value for entry in parsed["rule"] for key, value in entry.items()
    }
    # Store the merged mapping back so both views share one object.
    parsed["rule"] = merged_rule
    return {"task_dict": parsed, "rule_dict": merged_rule}
|
|
|
|
|
|
|
|
|
|
def task_fun_mq(task_str):
    """Parse a task string and merge its list of rule dicts into one dict.

    Before parsing, every ``"[`` / ``]"`` sequence is reduced to a bare
    bracket and every backslash is removed, so a rule list that was embedded
    as an escaped, double-quoted string is parsed as a real list. The dicts
    in that list are then merged into one mapping; on duplicate keys the
    later entry wins.

    :param task_str: textual form of a dict that contains a ``rule`` entry
    :return: ``{"task_dict": parsed, "rule_dict": merged}`` where
        ``parsed["rule"]`` is the very same object as ``merged``
    :raises KeyError: if the parsed payload has no ``rule`` entry
    """
    unquoted = task_str.replace('"[', "[").replace(']"', "]").replace("\\", "")
    # SECURITY NOTE(review): eval() executes arbitrary expressions. It is
    # kept so existing payloads parse exactly as before, but if task_str can
    # come from an untrusted source this should move to ast.literal_eval or
    # json.loads once the payload format is confirmed.
    parsed = dict(eval(unquoted))
    merged_rule = {
        key: value for entry in parsed["rule"] for key, value in entry.items()
    }
    # Store the merged mapping back so both views share one object.
    parsed["rule"] = merged_rule
    return {"task_dict": parsed, "rule_dict": merged_rule}
|
|
|
|
|
|
|
|
|
|
@@ -159,7 +157,7 @@ def get_consumer(topic_name, group_id):
|
|
# AccessKey ID,阿里云身份验证标识。获取方式,请参见创建AccessKey。
|
|
# AccessKey ID,阿里云身份验证标识。获取方式,请参见创建AccessKey。
|
|
"LTAI4G7puhXtLyHzHQpD6H7A",
|
|
"LTAI4G7puhXtLyHzHQpD6H7A",
|
|
# AccessKey Secret,阿里云身份验证密钥。获取方式,请参见创建AccessKey。
|
|
# AccessKey Secret,阿里云身份验证密钥。获取方式,请参见创建AccessKey。
|
|
- "nEbq3xWNQd1qLpdy2u71qFweHkZjSG"
|
|
|
|
|
|
+ "nEbq3xWNQd1qLpdy2u71qFweHkZjSG",
|
|
)
|
|
)
|
|
# 消息所属的Topic,在云消息队列 RocketMQ 版控制台创建。
|
|
# 消息所属的Topic,在云消息队列 RocketMQ 版控制台创建。
|
|
# topic_name = "${TOPIC}"
|
|
# topic_name = "${TOPIC}"
|
|
@@ -181,7 +179,9 @@ def ack_message(log_type, crawler, recv_msgs, consumer):
|
|
try:
|
|
try:
|
|
receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
|
|
receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
|
|
consumer.ack_message(receipt_handle_list)
|
|
consumer.ack_message(receipt_handle_list)
|
|
- Common.logger(log_type, crawler).info(f"Ack {len(receipt_handle_list)} Message Succeed.\n")
|
|
|
|
|
|
+ Common.logger(log_type, crawler).info(
|
|
|
|
+ f"Ack {len(receipt_handle_list)} Message Succeed.\n"
|
|
|
|
+ )
|
|
except MQExceptionBase as err:
|
|
except MQExceptionBase as err:
|
|
Common.logger(log_type, crawler).info(f"Ack Message Fail! Exception:{err}\n")
|
|
Common.logger(log_type, crawler).info(f"Ack Message Fail! Exception:{err}\n")
|
|
|
|
|
|
@@ -200,7 +200,9 @@ def download_rule(log_type, crawler, video_dict, rule_dict):
|
|
video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
|
|
video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
|
|
# 格式化 video_dict:period
|
|
# 格式化 video_dict:period
|
|
if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
|
|
if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
|
|
- video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
|
|
|
|
|
|
+ video_dict["period"] = int(
|
|
|
|
+ (int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000)
|
|
|
|
+ )
|
|
# 格式化 rule_dict 最大值取值为 0 的问题
|
|
# 格式化 rule_dict 最大值取值为 0 的问题
|
|
for rule_value in rule_dict.values():
|
|
for rule_value in rule_dict.values():
|
|
if rule_value["max"] == 0:
|
|
if rule_value["max"] == 0:
|
|
@@ -215,11 +217,15 @@ def download_rule(log_type, crawler, video_dict, rule_dict):
|
|
if video_key == rule_key == "period":
|
|
if video_key == rule_key == "period":
|
|
result = 0 <= int(video_value) <= int(rule_value["max"])
|
|
result = 0 <= int(video_value) <= int(rule_value["max"])
|
|
Common.logger(log_type, crawler).info(
|
|
Common.logger(log_type, crawler).info(
|
|
- f'{video_key}: 0 <= {video_value} <= {rule_value["min"]}, {result}')
|
|
|
|
|
|
+ f'{video_key}: 0 <= {video_value} <= {rule_value["min"]}, {result}'
|
|
|
|
+ )
|
|
elif video_key == rule_key:
|
|
elif video_key == rule_key:
|
|
- result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
|
|
|
|
|
|
+ result = (
|
|
|
|
+ int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
|
|
|
|
+ )
|
|
Common.logger(log_type, crawler).info(
|
|
Common.logger(log_type, crawler).info(
|
|
- f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
|
|
|
|
|
|
+ f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}'
|
|
|
|
+ )
|
|
else:
|
|
else:
|
|
result = True
|
|
result = True
|
|
|
|
|
|
@@ -243,20 +249,37 @@ def download_rule_v2(log_type, crawler, video_dict, rule_dict):
|
|
if video_dict.get("publish_time_stamp"):
|
|
if video_dict.get("publish_time_stamp"):
|
|
video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
|
|
video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
|
|
# 格式化 video_dict:period
|
|
# 格式化 video_dict:period
|
|
- if video_dict.get("publish_time") and video_dict.get("period", "noperiod") == "noperiod":
|
|
|
|
- video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
|
|
|
|
|
|
+ if (
|
|
|
|
+ video_dict.get("publish_time")
|
|
|
|
+ and video_dict.get("period", "noperiod") == "noperiod"
|
|
|
|
+ ):
|
|
|
|
+ video_dict["period"] = int(
|
|
|
|
+ (int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000)
|
|
|
|
+ )
|
|
# 格式化 rule_dict 最大值取值为 0 的问题
|
|
# 格式化 rule_dict 最大值取值为 0 的问题
|
|
for key in video_dict:
|
|
for key in video_dict:
|
|
if rule_dict.get(key):
|
|
if rule_dict.get(key):
|
|
- max_value = int(rule_dict[key]["max"]) if int(rule_dict[key]["max"]) > 0 else 999999999999999
|
|
|
|
|
|
+ max_value = (
|
|
|
|
+ int(rule_dict[key]["max"])
|
|
|
|
+ if int(rule_dict[key]["max"]) > 0
|
|
|
|
+ else 999999999999999
|
|
|
|
+ )
|
|
if key == "peroid":
|
|
if key == "peroid":
|
|
flag = 0 <= int(video_dict[key]) <= max_value
|
|
flag = 0 <= int(video_dict[key]) <= max_value
|
|
- Common.logger(log_type, crawler).info('{}: 0 <= {} <= {}, {}'.format(key, video_dict[key], max_value, flag))
|
|
|
|
|
|
+ Common.logger(log_type, crawler).info(
|
|
|
|
+ "{}: 0 <= {} <= {}, {}".format(
|
|
|
|
+ key, video_dict[key], max_value, flag
|
|
|
|
+ )
|
|
|
|
+ )
|
|
if not flag:
|
|
if not flag:
|
|
return flag
|
|
return flag
|
|
else:
|
|
else:
|
|
flag = int(rule_dict[key]["min"]) <= int(video_dict[key] <= max_value)
|
|
flag = int(rule_dict[key]["min"]) <= int(video_dict[key] <= max_value)
|
|
- Common.logger(log_type, crawler).info('{}: {} <= {} <= {}, {}'.format(key, rule_dict[key]["min"],video_dict[key], max_value, flag))
|
|
|
|
|
|
+ Common.logger(log_type, crawler).info(
|
|
|
|
+ "{}: {} <= {} <= {}, {}".format(
|
|
|
|
+ key, rule_dict[key]["min"], video_dict[key], max_value, flag
|
|
|
|
+ )
|
|
|
|
+ )
|
|
if not flag:
|
|
if not flag:
|
|
return flag
|
|
return flag
|
|
else:
|
|
else:
|
|
@@ -325,7 +348,7 @@ def task_unbind(log_type, crawler, taskid, uids, env):
|
|
params = {
|
|
params = {
|
|
"taskId": taskid, # 任务 ID
|
|
"taskId": taskid, # 任务 ID
|
|
"uids": uids, # 解绑用户uid(多个英文逗号隔开),例如"3222121,213231"
|
|
"uids": uids, # 解绑用户uid(多个英文逗号隔开),例如"3222121,213231"
|
|
- "operator": "" # 默认 system
|
|
|
|
|
|
+ "operator": "", # 默认 system
|
|
}
|
|
}
|
|
Common.logger(log_type, crawler).info(f"url:{url}")
|
|
Common.logger(log_type, crawler).info(f"url:{url}")
|
|
Common.logging(log_type, crawler, env, f"url:{url}")
|
|
Common.logging(log_type, crawler, env, f"url:{url}")
|
|
@@ -340,6 +363,30 @@ def task_unbind(log_type, crawler, taskid, uids, env):
|
|
return response.text
|
|
return response.text
|
|
|
|
|
|
|
|
|
|
|
|
# Characters rewritten before the "&NBSP" removal: "." becomes "。", the
# rest are deleted. Kept separate from the later table because deleting one
# of these can join text into an "&NBSP" token that must then be removed.
_TITLE_PRE_TABLE = str.maketrans(".", "。", "\n/\r#\\")

# Characters deleted after the "&NBSP" removal. Deleting these late means a
# token that only forms once they are gone (e.g. "&N:BSP") is left alone.
_TITLE_POST_TABLE = str.maketrans("", "", ":*?\"<>| '")


def clean_title(strings):
    """Strip a title and remove or replace a fixed set of characters.

    Steps, in order: trim surrounding whitespace; delete newline, ``/``,
    carriage return, ``#`` and backslash while turning ``.`` into ``。``;
    delete every literal ``&NBSP``; delete ``:``, ``*``, ``?``, ``"``,
    ``<``, ``>``, ``|``, space and ``'``.

    :param strings: the raw title text
    :return: the cleaned title
    """
    # NOTE(review): the previous replace-chain removed "?" and '"' twice
    # each; the duplicates were no-ops. Possibly full-width variants were
    # intended for the second occurrences — confirm with the author.
    trimmed = strings.strip()
    without_early = trimmed.translate(_TITLE_PRE_TABLE)
    without_nbsp = without_early.replace("&NBSP", "")
    return without_nbsp.translate(_TITLE_POST_TABLE)
|
|
|
|
+
|
|
|
|
+
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
- print(get_title_score("recommend", "kuaishou", "16QspO", "0usaDk", '像梦一场'))
|
|
|
|
|
|
+ print(get_title_score("recommend", "kuaishou", "16QspO", "0usaDk", "像梦一场"))
|
|
pass
|
|
pass
|