|
@@ -54,7 +54,7 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
title_list = [i[0] for i in mysql_response]
|
|
|
return title_list
|
|
|
|
|
|
- def get_title_from_meta_base(self, limit):
|
|
|
+ def get_article_title_from_meta_base(self, limit):
|
|
|
"""
|
|
|
获取meta_base表中文章标题列表
|
|
|
status: 1 表示文章初始化状态
|
|
@@ -70,17 +70,56 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
return mysql_response
|
|
|
|
|
|
- def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
|
|
|
+ def get_video_title_from_meta_table(self, limit):
|
|
|
"""
|
|
|
- 批量更新crawler_meta_article
|
|
|
+ 获取meta_base表中视频标题列表
|
|
|
+ audit_status = 0 表示视频初始化状态
|
|
|
+ """
|
|
|
+ if limit:
|
|
|
+ sql = f"""
|
|
|
+ select id as article_id, article_title as title
|
|
|
+ from publish_single_video_source
|
|
|
+ where audit_status = 0
|
|
|
+ and score is null
|
|
|
+ and bad_status = 0
|
|
|
+ limit {limit};
|
|
|
+ """
|
|
|
+ else:
|
|
|
+ sql = f"""
|
|
|
+ select id as article_id, article_title as title
|
|
|
+ from publish_single_video_source
|
|
|
+ where audit_status = 0
|
|
|
+ and score is null
|
|
|
+ and bad_status = 0;
|
|
|
+ """
|
|
|
+ mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ return mysql_response
|
|
|
+
|
|
|
+ def update_meta_database_batch(self, meta_source: str, update_data_list: list[tuple]) -> int:
|
|
|
"""
|
|
|
- sql = """
|
|
|
- update crawler_meta_article
|
|
|
- set score = case article_id
|
|
|
- {}
|
|
|
- end
|
|
|
- where article_id in %s and score is null;
|
|
|
+ 批量更新crawler_meta_article
|
|
|
"""
|
|
|
+ match meta_source:
|
|
|
+ case "video":
|
|
|
+ sql = """
|
|
|
+ update publish_single_video_source
|
|
|
+ set score = case id
|
|
|
+ {}
|
|
|
+ end
|
|
|
+ where id in %s and score is null;
|
|
|
+ """
|
|
|
+ case "article":
|
|
|
+ sql = """
|
|
|
+ update crawler_meta_article
|
|
|
+ set score = case article_id
|
|
|
+ {}
|
|
|
+ end
|
|
|
+ where article_id in %s and score is null;
|
|
|
+ """
|
|
|
+ case _:
|
|
|
+ print("source_type is not valid")
|
|
|
+ return 0
|
|
|
+
|
|
|
case_statement = []
|
|
|
article_id_list = []
|
|
|
params = []
|
|
@@ -95,22 +134,29 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
affected_rows = self.db_client.save(formatted_sql, params)
|
|
|
return affected_rows
|
|
|
|
|
|
- def run(self, limit=None):
|
|
|
+ def run(self, meta_source, limit=None):
|
|
|
"""
|
|
|
执行任务
|
|
|
"""
|
|
|
- target_article_list = self.get_title_from_meta_base(limit=limit)
|
|
|
- if not target_article_list:
|
|
|
+ match meta_source:
|
|
|
+ case "article":
|
|
|
+ target_list = self.get_article_title_from_meta_base(limit=limit)
|
|
|
+ case "video":
|
|
|
+ target_list = self.get_video_title_from_meta_table(limit=limit)
|
|
|
+ case _:
|
|
|
+ print("meta_source is not valid")
|
|
|
+ return
|
|
|
+
|
|
|
+ if not target_list:
|
|
|
print("No more articles to process.")
|
|
|
return
|
|
|
|
|
|
base_title_list = self.get_level_up_title_list()
|
|
|
-
|
|
|
- batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
|
|
|
+ batch_task_list = chunks(target_list, ARTICLE_BATCH)
|
|
|
|
|
|
for batch_task in batch_task_list:
|
|
|
try:
|
|
|
- batch_target_title_list = [i['title'] for i in batch_task]
|
|
|
+ batch_target_title_list = [i['title'][:30] for i in batch_task]
|
|
|
similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
|
|
|
|
|
|
update_data_list = []
|
|
@@ -119,7 +165,7 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
|
|
|
update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
|
|
|
|
|
|
- affected_rows = self.update_meta_article_batch(update_data_list)
|
|
|
+ affected_rows = self.update_meta_database_batch(meta_source=meta_source, update_data_list=update_data_list)
|
|
|
|
|
|
print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
|
|
|
except Exception as e:
|