|
@@ -9,15 +9,25 @@ from applications.api import similarity_between_title_list
|
|
|
from applications.db import DatabaseConnector
|
|
|
from config import long_articles_config
|
|
|
|
|
|
-threshold_date = '20250101'
|
|
|
-article_batch = 1000
|
|
|
+
|
|
|
+TIMESTAMP_MS_THRESHOLD = 1732982400000
|
|
|
+ARTICLE_BATCH = 1000
|
|
|
PERCENT_THRESHOLD = 95
|
|
|
|
|
|
|
|
|
+def chunks(total_list, batch_size):
|
|
|
+ """
|
|
|
+ yield batch tasks
|
|
|
+ """
|
|
|
+ for i in range(0, len(total_list), batch_size):
|
|
|
+ yield total_list[i:i + batch_size]
|
|
|
+
|
|
|
+
|
|
|
class ColdStartTitleSimilarityTask(object):
|
|
|
"""
|
|
|
冷启动文章标题相似度任务
|
|
|
"""
|
|
|
+
|
|
|
def __init__(self):
|
|
|
self.db_client = None
|
|
|
|
|
@@ -31,9 +41,12 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
def get_level_up_title_list(self):
|
|
|
"""
|
|
|
获取晋级文章标题列表
|
|
|
+ status: 1 表示文章已经溯源完成
|
|
|
+ deleted: 0 表示文章正常
|
|
|
+ level = 'autoArticlePoolLevel1' 表示头条
|
|
|
"""
|
|
|
sql = f"""
|
|
|
- select distinct title from datastat_sort_strategy where date_str > '{threshold_date}' and position < 3;
|
|
|
+ select distinct title from article_pool_promotion_source where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
|
|
|
"""
|
|
|
mysql_response = self.db_client.fetch(query=sql)
|
|
|
title_list = [i[0] for i in mysql_response]
|
|
@@ -42,9 +55,10 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
def get_title_from_meta_base(self):
|
|
|
"""
|
|
|
获取meta_base表中文章标题列表
|
|
|
+ status: 1 表示文章初始化状态
|
|
|
"""
|
|
|
sql = f"""
|
|
|
- select article_id, title from crawler_meta_article where status = 1 and score is null limit {article_batch};
|
|
|
+ select article_id, title from crawler_meta_article where status = 1 and score is null;
|
|
|
"""
|
|
|
mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
return mysql_response
|
|
@@ -78,22 +92,24 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
"""
|
|
|
执行任务
|
|
|
"""
|
|
|
- base_title_list = self.get_level_up_title_list()
|
|
|
target_article_list = self.get_title_from_meta_base()
|
|
|
if not target_article_list:
|
|
|
print("No more articles to process.")
|
|
|
return
|
|
|
|
|
|
- target_title_list = [i['title'] for i in target_article_list]
|
|
|
- similarity_array = similarity_between_title_list(target_title_list, base_title_list)
|
|
|
+ base_title_list = self.get_level_up_title_list()
|
|
|
|
|
|
- update_data_list = []
|
|
|
- for index, score_list in enumerate(similarity_array):
|
|
|
- sorted_score_list = sorted(score_list)
|
|
|
- percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
|
|
|
- update_data_list.append((percent_threshold_score, target_article_list[index]['article_id']))
|
|
|
+ batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
|
|
|
|
|
|
- affected_rows = self.update_meta_article_batch(update_data_list)
|
|
|
- print(affected_rows)
|
|
|
+ for batch_task in batch_task_list:
|
|
|
+ batch_target_title_list = [i['title'] for i in batch_task]
|
|
|
+ similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
|
|
|
|
|
|
+ update_data_list = []
|
|
|
+ for index, score_list in enumerate(similarity_array):
|
|
|
+ sorted_score_list = sorted(score_list)
|
|
|
+ percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
|
|
|
+ update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
|
|
|
|
|
|
+ affected_rows = self.update_meta_article_batch(update_data_list)
|
|
|
+ print(affected_rows)
|