Przeglądaj źródła

账号联想--优化

luojunhui 2 miesięcy temu
rodzic
commit
e434eaebfe

+ 7 - 0
account_cold_start_daily.py

@@ -9,6 +9,7 @@ from argparse import ArgumentParser
 from applications import longArticlesMySQL, bot
 from coldStartTasks.crawler.weixinCategoryCrawler import weixinCategory
 from coldStartTasks.publish.publishCategoryArticles import CategoryColdStartTask
+from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
 
 DEFAULT_CATEGORY_LIST = ['1030-手动挑号', 'account_association']
 
@@ -49,6 +50,12 @@ class AccountColdStartDailyTask(object):
         try:
             weixin_category_crawler = weixinCategory(db_client=self.db_client)
             weixin_category_crawler.deal(category_list=category_list, date_str=date_str)
+
+            # 抓取完成之后,给抓取到的标题进行相似度打分
+            cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
+            cold_start_title_similarity_task.init_database()
+            cold_start_title_similarity_task.run()
+
             bot(
                 title="账号冷启动任务,抓取完成",
                 detail={

+ 2 - 5
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -22,7 +22,7 @@ ACCOUNT_NOT_DAILY_SCRAPE = 0
 DEFAULT_VIEW_COUNT = 0
 DEFAULT_LIKE_COUNT = 0
 DEFAULT_ARTICLE_STATUS = 1
-DEFAULT_TIMESTAMP = 1704038400
+DEFAULT_TIMESTAMP = 1717171200
 
 
 class weixinCategory(object):
@@ -246,13 +246,10 @@ class weixinCategory(object):
         association_account_list = self.get_association_account_list(date_str)
         self.crawler_each_category(account_list=association_account_list, category="association")
 
-        # 抓完之后,执行相似度打分任务
-        return
-
     def deal_accounts(self, account_list):
         """
         input account list
-        :param account_list:
+        :param account_list: 具体账号抓取,只抓一页
         :return:
         """
         account_tuple = tuple(account_list)

+ 30 - 14
coldStartTasks/filter/title_similarity_task.py

@@ -9,15 +9,25 @@ from applications.api import similarity_between_title_list
 from applications.db import DatabaseConnector
 from config import long_articles_config
 
-threshold_date = '20250101'
-article_batch = 1000
+
+TIMESTAMP_MS_THRESHOLD = 1732982400000
+ARTICLE_BATCH = 1000
 PERCENT_THRESHOLD = 95
 
 
def chunks(total_list, batch_size):
    """
    Yield successive slices of *total_list*, each holding at most
    *batch_size* elements (the final slice may be shorter).
    """
    for offset in range(0, len(total_list), batch_size):
        yield total_list[offset:offset + batch_size]
+
+
 class ColdStartTitleSimilarityTask(object):
     """
     冷启动文章标题相似度任务
     """
+
    def __init__(self):
        # Database client is attached lazily by init_database(); None until then.
        self.db_client = None
 
@@ -31,9 +41,12 @@ class ColdStartTitleSimilarityTask(object):
     def get_level_up_title_list(self):
         """
         获取晋级文章标题列表
+        status: 1 表示文章已经溯源完成
+        deleted: 0 表示文章正常
+        level = 'autoArticlePoolLevel1' 表示头条
         """
         sql = f"""
-        select distinct title from datastat_sort_strategy where date_str > '{threshold_date}' and position < 3;
+        select distinct title from article_pool_promotion_source where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
         """
         mysql_response = self.db_client.fetch(query=sql)
         title_list = [i[0] for i in mysql_response]
@@ -42,9 +55,10 @@ class ColdStartTitleSimilarityTask(object):
     def get_title_from_meta_base(self):
         """
         获取meta_base表中文章标题列表
+        status: 1 表示文章初始化状态
         """
         sql = f"""
-            select article_id, title from crawler_meta_article where status = 1 and score is null limit {article_batch};
+            select article_id, title from crawler_meta_article where status = 1 and score is null;
         """
         mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return mysql_response
@@ -78,22 +92,24 @@ class ColdStartTitleSimilarityTask(object):
         """
         执行任务
         """
-        base_title_list = self.get_level_up_title_list()
         target_article_list = self.get_title_from_meta_base()
         if not target_article_list:
             print("No more articles to process.")
             return
 
-        target_title_list = [i['title'] for i in target_article_list]
-        similarity_array = similarity_between_title_list(target_title_list, base_title_list)
+        base_title_list = self.get_level_up_title_list()
 
-        update_data_list = []
-        for index, score_list in enumerate(similarity_array):
-            sorted_score_list = sorted(score_list)
-            percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
-            update_data_list.append((percent_threshold_score, target_article_list[index]['article_id']))
+        batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
 
-        affected_rows = self.update_meta_article_batch(update_data_list)
-        print(affected_rows)
+        for batch_task in batch_task_list:
+            batch_target_title_list = [i['title'] for i in batch_task]
+            similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
 
+            update_data_list = []
+            for index, score_list in enumerate(similarity_array):
+                sorted_score_list = sorted(score_list)
+                percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
+                update_data_list.append((percent_threshold_score, batch_target_title_list[index]['article_id']))
 
+            affected_rows = self.update_meta_article_batch(update_data_list)
+            print(affected_rows)

+ 33 - 10
coldStartTasks/publish/publishCategoryArticles.py

@@ -103,10 +103,9 @@ class CategoryColdStartTask(object):
                                columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity', 'score'])
         return article_df
 
-    def change_article_status(self, category):
+    def filter_each_category(self, category):
         """
-        已经发布到生成计划中的 id,
-        :return:
+        过滤单个生成计划类别的文章
         """
         plan_id = self.category_map.get(category)
         if plan_id:
@@ -115,13 +114,13 @@ class CategoryColdStartTask(object):
             if title_list:
                 # update
                 update_sql = f"""
-                UPDATE 
-                    crawler_meta_article
-                SET
-                    status = %s
-                WHERE
-                    title in %s and status = %s;
-                """
+                            UPDATE 
+                                crawler_meta_article
+                            SET
+                                status = %s
+                            WHERE
+                                title in %s and status = %s;
+                            """
                 affected_rows = self.db_client.update(
                     sql=update_sql,
                     params=(self.PUBLISHED_STATUS, tuple(title_list), self.INIT_STATUS)
@@ -131,6 +130,27 @@ class CategoryColdStartTask(object):
             print("未获取到计划id")
             return
 
+    def published_articles_title_filter(self):
+        """
+        已经发布到生成计划中的 id,
+        :return:
+        """
+        category_list = list(self.category_map.keys())
+        for category in category_list:
+            try:
+                self.filter_each_category(category)
+            except Exception as e:
+                log(
+                    task="category_publish_task",
+                    function="published_articles_title_filter",
+                    message="过滤已发布文章失败",
+                    data={
+                        "error": str(e),
+                        "error_msg": traceback.format_exc(),
+                        "category": category
+                    }
+                )
+
     def change_article_status_while_publishing(self, article_id_list):
         """
 
@@ -356,6 +376,9 @@ class CategoryColdStartTask(object):
         )
         for category in category_list:
             try:
+                # 已发布标题去重
+                self.published_articles_title_filter(category=category)
+
                 category_df = self.get_articles_from_meta_table(category=category, article_source=article_source)
                 self.publish_filter_articles(
                     category=category,