|
@@ -1,7 +1,9 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
+import datetime
|
|
|
import numpy as np
|
|
|
+import traceback
|
|
|
|
|
|
from pymysql.cursors import DictCursor
|
|
|
|
|
@@ -52,14 +54,19 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
title_list = [i[0] for i in mysql_response]
|
|
|
return title_list
|
|
|
|
|
|
def get_title_from_meta_base(self, limit):
    """
    Fetch the articles from the meta_base table that still need a score.

    Selects ``article_id`` and ``title`` from ``crawler_meta_article`` for
    rows with ``status = 1`` (per the original note, 1 marks an article in
    its initial state) and ``score is null``.

    :param limit: maximum number of rows to fetch. Any falsy value
        (``None``, ``0``) means "no limit", matching the previous behaviour.
    :return: whatever ``self.db_client.fetch`` returns for the query when
        called with ``DictCursor`` (presumably a sequence of dict rows with
        ``article_id`` and ``title`` keys -- confirm against the db client).
    :raises ValueError: if ``limit`` is truthy but is not a non-negative
        integer value.
    """
    sql = (
        "select article_id, title from crawler_meta_article "
        "where status = 1 and score is null"
    )
    if limit:
        # The value ends up inside the SQL text, so force it to a plain
        # integer first: anything else would produce a malformed or
        # injectable statement.
        try:
            row_limit = int(limit)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f"limit must be an integer, got {limit!r}"
            ) from err
        if row_limit < 0:
            raise ValueError(f"limit must be non-negative, got {limit!r}")
        sql = f"{sql} limit {row_limit}"
    sql += ";"
    return self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
|
|
@@ -72,7 +79,7 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
set score = case article_id
|
|
|
{}
|
|
|
end
|
|
|
- where article_id in %s;
|
|
|
+ where article_id in %s and score is null;
|
|
|
"""
|
|
|
case_statement = []
|
|
|
article_id_list = []
|
|
@@ -88,11 +95,11 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
affected_rows = self.db_client.save(formatted_sql, params)
|
|
|
return affected_rows
|
|
|
|
|
|
- def run(self):
|
|
|
+ def run(self, limit=None):
|
|
|
"""
|
|
|
执行任务
|
|
|
"""
|
|
|
- target_article_list = self.get_title_from_meta_base()
|
|
|
+ target_article_list = self.get_title_from_meta_base(limit=limit)
|
|
|
if not target_article_list:
|
|
|
print("No more articles to process.")
|
|
|
return
|
|
@@ -102,14 +109,20 @@ class ColdStartTitleSimilarityTask(object):
|
|
|
batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
|
|
|
|
|
|
for batch_task in batch_task_list:
|
|
|
- batch_target_title_list = [i['title'] for i in batch_task]
|
|
|
- similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
|
|
|
-
|
|
|
- update_data_list = []
|
|
|
- for index, score_list in enumerate(similarity_array):
|
|
|
- sorted_score_list = sorted(score_list)
|
|
|
- percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
|
|
|
- update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
|
|
|
-
|
|
|
- affected_rows = self.update_meta_article_batch(update_data_list)
|
|
|
- print(affected_rows)
|
|
|
+ try:
|
|
|
+ batch_target_title_list = [i['title'] for i in batch_task]
|
|
|
+ similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
|
|
|
+
|
|
|
+ update_data_list = []
|
|
|
+ for index, score_list in enumerate(similarity_array):
|
|
|
+ sorted_score_list = sorted(score_list)
|
|
|
+ percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
|
|
|
+ update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
|
|
|
+
|
|
|
+ affected_rows = self.update_meta_article_batch(update_data_list)
|
|
|
+
|
|
|
+ print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
|
|
|
+ except Exception as e:
|
|
|
+ print("{}: \t本次任务处理失败: {}".format(datetime.datetime.today().__str__(), e))
|
|
|
+ print(traceback.format_exc())
|
|
|
+ continue
|