|
@@ -0,0 +1,99 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import numpy as np
|
|
|
+
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
+
|
|
|
+from applications.api import similarity_between_title_list
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from config import long_articles_config
|
|
|
+
|
|
|
# Only level-up stats recorded after this date (YYYYMMDD string,
# compared lexically in SQL) are used as the similarity baseline.
threshold_date = '20250101'

# Max number of unscored articles pulled from crawler_meta_article per run.
article_batch = 1000

# Percentile of each article's similarity-score distribution that is
# persisted as its score.
PERCENT_THRESHOLD = 95
|
|
|
+
|
|
|
+
|
|
|
class ColdStartTitleSimilarityTask(object):
    """
    Cold-start article title similarity task.

    Scores a batch of not-yet-scored cold-start articles by comparing their
    titles against titles that already "leveled up" (ranked position < 3
    after ``threshold_date``), then writes the scores back to
    ``crawler_meta_article`` in a single batched UPDATE.
    """

    def __init__(self):
        # Database client; created lazily by init_database().
        self.db_client = None

    def init_database(self):
        """
        Create the long-articles database client and open its connection.
        """
        self.db_client = DatabaseConnector(long_articles_config)
        self.db_client.connect()

    def get_level_up_title_list(self):
        """
        Fetch distinct titles of articles that leveled up.

        :return: list of title strings for rows with
            ``date_str > threshold_date`` and ``position < 3``.
        """
        # threshold_date is a module constant, not user input, so the
        # f-string interpolation here carries no injection risk.
        sql = f"""
            select distinct title from datastat_sort_strategy where date_str > '{threshold_date}' and position < 3;
        """
        mysql_response = self.db_client.fetch(query=sql)
        return [row[0] for row in mysql_response]

    def get_title_from_meta_base(self):
        """
        Fetch one batch of unscored cold-start articles.

        :return: list of dict rows (``article_id``, ``title``) where
            ``status = 1`` and ``score is null``, at most ``article_batch``.
        """
        sql = f"""
            select article_id, title from crawler_meta_article where status = 1 and score is null limit {article_batch};
        """
        return self.db_client.fetch(query=sql, cursor_type=DictCursor)

    def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
        """
        Batch-update scores in crawler_meta_article with a single
        ``set score = case article_id ... end`` statement.

        :param update_data_list: list of (score, article_id) tuples
        :return: number of affected rows reported by the driver
        """
        sql = """
            update crawler_meta_article
            set score = case article_id
            {}
            end
            where article_id in %s;
        """
        case_statement = []
        article_id_list = []
        params = []
        for score, article_id in update_data_list:
            # Plain string: the original used an f-string with no placeholders.
            case_statement.append("when %s then %s")
            article_id_list.append(article_id)
            params.extend([article_id, score])

        # Final parameter binds the IN (...) clause.
        params.append(tuple(article_id_list))
        formatted_sql = sql.format("\n".join(case_statement))
        return self.db_client.save(formatted_sql, params)

    def run(self):
        """
        Entry point: score one batch of cold-start titles and persist them.
        """
        base_title_list = self.get_level_up_title_list()
        target_article_list = self.get_title_from_meta_base()
        if not target_article_list:
            print("No more articles to process.")
            return

        if not base_title_list:
            # Guard: np.percentile raises on an empty distribution, which
            # is what similarity_between_title_list would yield per row
            # with no baseline titles.
            print("No level-up titles to compare against.")
            return

        target_title_list = [i['title'] for i in target_article_list]
        similarity_array = similarity_between_title_list(target_title_list, base_title_list)

        update_data_list = []
        for index, score_list in enumerate(similarity_array):
            # np.percentile does not require pre-sorted input; the
            # original's sorted() pass was redundant work.
            percent_threshold_score = np.percentile(score_list, PERCENT_THRESHOLD)
            update_data_list.append((percent_threshold_score, target_article_list[index]['article_id']))

        affected_rows = self.update_meta_article_batch(update_data_list)
        print(affected_rows)
|