Explorar o código

Merge branch '2025-02-07-add-nlp-similarity-score' of luojunhui/LongArticlesJob into 2025-02-07-account-assocaition-improve

luojunhui hai 2 meses
pai
achega
374c2d42df

+ 2 - 1
applications/api/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 """
-from .moon_shot_api import generate_mini_program_title
+from .moon_shot_api import generate_mini_program_title
+from .nlp_api import similarity_between_title_list

+ 26 - 0
applications/api/nlp_api.py

@@ -0,0 +1,26 @@
+"""
+@author: luojunhui
+"""
+import requests
+
+
+def similarity_between_title_list(target_title_list: list[str], base_title_list: list[str], timeout: float = 120) -> list[list[float]]:
+    """
+    Compute cross similarity scores between two lists of titles via the remote NLP service.
+
+    The request uses the service's ``similarities_cross`` function with caching disabled.
+
+    :param target_title_list: titles to be scored (sent as ``text_list_a``)
+    :param base_title_list: titles to compare against (sent as ``text_list_b``)
+    :param timeout: seconds to wait for the service before giving up (default 120)
+    :return: the ``score_list_list`` field of the response; presumably one list per
+        target title holding one score per base title (confirm against the service)
+    :raises requests.HTTPError: if the service answers with a 4xx/5xx status
+    :raises KeyError: if the response JSON has no ``score_list_list`` field
+    """
+    url = 'http://61.48.133.26:6060/nlp'
+    body = {
+        "data": {
+            "text_list_a": target_title_list,
+            "text_list_b": base_title_list
+        },
+        "function": "similarities_cross",
+        "use_cache": False
+    }
+    response = requests.post(url, json=body, timeout=timeout)
+    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
+    response.raise_for_status()
+    return response.json()['score_list_list']
+

+ 99 - 0
coldStartTasks/filter/title_similarity_task.py

@@ -0,0 +1,99 @@
+"""
+@author: luojunhui
+"""
+import numpy as np
+
+from pymysql.cursors import DictCursor
+
+from applications.api import similarity_between_title_list
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+# Base titles are selected with `date_str > '<threshold_date>'` (exclusive lower bound).
+threshold_date = '20250101'
+# Maximum number of unscored articles fetched per run (used as the SQL LIMIT).
+article_batch = 1000
+# Percentile of each article's similarity scores that is stored as its score.
+PERCENT_THRESHOLD = 95
+
+
+class ColdStartTitleSimilarityTask(object):
+    """
+    Cold-start article title similarity task.
+
+    Fetches a batch of unscored rows from ``crawler_meta_article``, compares their
+    titles with the "level-up" titles from ``datastat_sort_strategy`` through the NLP
+    similarity service, and writes one score per article back to the table.
+    """
+    def __init__(self):
+        # Set by init_database(); stays None until that method is called.
+        self.db_client = None
+
+    def init_database(self):
+        """
+        Create the database connector from ``long_articles_config`` and connect it.
+        """
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+
+    def get_level_up_title_list(self):
+        """
+        Fetch the distinct titles of level-up articles.
+
+        Selects rows of ``datastat_sort_strategy`` with ``date_str`` after
+        ``threshold_date`` and ``position < 3``.
+
+        :return: list of title strings
+        """
+        sql = f"""
+        select distinct title from datastat_sort_strategy where date_str > '{threshold_date}' and position < 3;
+        """
+        mysql_response = self.db_client.fetch(query=sql)
+        # Rows come back as sequences; the title is the only selected column.
+        return [row[0] for row in mysql_response]
+
+    def get_title_from_meta_base(self):
+        """
+        Fetch up to ``article_batch`` articles that still need a score.
+
+        Selects rows of ``crawler_meta_article`` with ``status = 1`` and a null ``score``.
+
+        :return: rows fetched with a DictCursor, each with ``article_id`` and ``title``
+        """
+        sql = f"""
+            select article_id, title from crawler_meta_article where status = 1 and score is null limit {article_batch};
+        """
+        return self.db_client.fetch(query=sql, cursor_type=DictCursor)
+
+    def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
+        """
+        Update the scores of many ``crawler_meta_article`` rows in a single statement.
+
+        :param update_data_list: list of ``(score, article_id)`` tuples
+        :return: value returned by the database client's ``save`` call
+            (0 when the list is empty and nothing is executed)
+        """
+        if not update_data_list:
+            # An empty batch would render "case article_id end ... in ()", which is invalid SQL.
+            return 0
+
+        sql = """
+            update crawler_meta_article
+            set score = case article_id
+                {}
+            end
+            where article_id in %s;
+        """
+        article_id_list = []
+        params = []
+        for score, article_id in update_data_list:
+            article_id_list.append(article_id)
+            # Placeholder order inside each "when %s then %s" is (article_id, score).
+            params.extend([article_id, score])
+
+        # The trailing tuple fills the "in %s" placeholder.
+        params.append(tuple(article_id_list))
+        case_statements = "\n".join(["when %s then %s"] * len(update_data_list))
+        formatted_sql = sql.format(case_statements)
+        return self.db_client.save(formatted_sql, params)
+
+    def run(self):
+        """
+        Score one batch of articles.
+
+        :return: True if a batch was scored and submitted for update; False if there
+            was nothing to process (no unscored articles, or no base titles), so a
+            caller looping over batches can stop
+        """
+        base_title_list = self.get_level_up_title_list()
+        target_article_list = self.get_title_from_meta_base()
+        if not target_article_list:
+            print("No more articles to process.")
+            return False
+
+        if not base_title_list:
+            # Without base titles every score list is empty and no percentile exists.
+            print("No base titles to compare against.")
+            return False
+
+        target_title_list = [article['title'] for article in target_article_list]
+        similarity_array = similarity_between_title_list(target_title_list, base_title_list)
+
+        update_data_list = []
+        for article, score_list in zip(target_article_list, similarity_array):
+            if not score_list:
+                # Nothing to take a percentile of; leave this article unscored.
+                continue
+            # np.percentile does not need pre-sorted input; convert the NumPy scalar
+            # to a plain float before handing it to the database driver.
+            percent_threshold_score = float(np.percentile(score_list, PERCENT_THRESHOLD))
+            update_data_list.append((percent_threshold_score, article['article_id']))
+
+        affected_rows = self.update_meta_article_batch(update_data_list)
+        print(affected_rows)
+        return True
+
+

+ 11 - 0
title_similarity_score_task.py

@@ -0,0 +1,11 @@
+"""
+@author: luojunhui
+"""
+from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
+
+
+# Script entry point: connect to the database, then score articles batch after batch.
+if __name__ == '__main__':
+    task = ColdStartTitleSimilarityTask()
+    task.init_database()
+    # NOTE(review): this loop has no exit condition, so the process keeps calling
+    # run() (and re-querying) until it is killed or run() raises. Consider stopping
+    # once run() reports that there is nothing left to process.
+    while True:
+        task.run()