Sfoglia il codice sorgente

Merge branch 'feature/luojunhui/20251229-improve-coldstart' of Server/LongArticleTaskServer into master

luojunhui 2 mesi fa
parent
commit
e4e1a5f05b

+ 14 - 14
applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_const.py

@@ -2,7 +2,7 @@ class ArticlePoolColdStartConst:
     # article
     DAILY_ARTICLE_NUM = 1000
     DAILY_CATEGORY_ARTICLE_NUM = 120
-    SIMILARITY_SCORE_THRESHOLD = 0.5
+    SIMILARITY_SCORE_THRESHOLD = 0.53
 
     TITLE_NOT_SENSITIVE = 0
     TITLE_SENSITIVE = 1
@@ -22,27 +22,27 @@ class ArticlePoolColdStartConst:
     CATEGORY_CONFIG_MAP = {
         "知识科普": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "军事历史": {
             "num": 50,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "家长里短": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "社会法治": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "奇闻趣事": {
             "num": 150,
-            "read_threshold": 2000,
+            "read_threshold": 500,
             "read_times_threshold": 1.4,
         },
         "名人八卦": {
@@ -52,12 +52,12 @@ class ArticlePoolColdStartConst:
         },
         "健康养生": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "情感故事": {
             "num": 200,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "国家大事": {
@@ -67,32 +67,32 @@ class ArticlePoolColdStartConst:
         },
         "现代人物": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "怀旧时光": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "政治新闻": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "历史人物": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "社会现象": {
             "num": 200,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
         "财经科技": {
             "num": 100,
-            "read_threshold": 1000,
+            "read_threshold": 500,
             "read_times_threshold": 1.3,
         },
     }

+ 37 - 6
applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_strategy.py

@@ -48,13 +48,13 @@ class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
 
             case "strategy_v2":
                 query = """
-                    select 
+                    select
                         article_id, title, link, llm_sensitivity, score, category_by_ai
-                    from crawler_meta_article t1 
+                    from crawler_meta_article t1
                     join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
-                    where category_by_ai = %s 
+                    where category_by_ai = %s
                         and platform = %s
-                        and title_sensitivity = %s 
+                        and title_sensitivity = %s
                         and t1.status = %s
                         and t1.read_cnt / t2.read_avg >= %s
                         and t1.read_cnt >= %s
@@ -68,15 +68,46 @@ class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
                         "weixin",
                         self.TITLE_NOT_SENSITIVE,
                         self.INIT_STATUS,
-                        # self.READ_TIMES_THRESHOLD,
                         self.CATEGORY_CONFIG_MAP.get(
                             category, self.READ_TIMES_THRESHOLD
                         ).get("read_times_threshold", self.READ_TIMES_THRESHOLD),
-                        # self.READ_THRESHOLD,
+                        # 阅读量阈值
+                        self.CATEGORY_CONFIG_MAP.get(category, self.READ_THRESHOLD).get(
+                            "read_threshold", self.READ_THRESHOLD
+                        ),
+                        self.INIT_STATUS,
+                    ),
+                )
+                return article_list
+
+            case "strategy_v3":
+                query = """
+                    select
+                        article_id, title, link, llm_sensitivity, score, category_by_ai, t1.read_cnt / t2.read_avg as read_avg_rate
+                    from crawler_meta_article t1
+                    join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
+                    where category_by_ai = %s
+                        and platform = %s
+                        and title_sensitivity = %s
+                        and t1.status = %s
+                        and t1.read_cnt >= %s
+                        and t2.status = %s
+                        and t1.score >= %s
+                    order by read_avg_rate desc;
+                """
+                article_list = await self.pool.async_fetch(
+                    query=query,
+                    params=(
+                        category,
+                        "weixin",
+                        self.TITLE_NOT_SENSITIVE,
+                        self.INIT_STATUS,
+                        # 阅读量阈值
                         self.CATEGORY_CONFIG_MAP.get(category, self.READ_THRESHOLD).get(
                             "read_threshold", self.READ_THRESHOLD
                         ),
                         self.INIT_STATUS,
+                        self.SIMILARITY_SCORE_THRESHOLD,
                     ),
                 )
                 return article_list

+ 18 - 1
applications/tasks/cold_start_tasks/article_pool/article_pool_filter_strategy.py

@@ -79,6 +79,7 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
                     mention=False,
                 )
                 daily_article_num = self.DAILY_ARTICLE_NUM
+
             case "strategy_v2":
                 self.cold_start_records.append(
                     {
@@ -87,7 +88,23 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
                         "total_length": total_length,
                         "filter_by_title_length": length_level1,
                         "filter_by_sensitivity": length_level2,
-                        "filter_by_llm_sensitity": length_level3,
+                        "filter_by_llm_sensitivity": length_level3,
+                        "filter_by_score": length_level4,
+                        "read_avg_threshold": self.READ_TIMES_THRESHOLD,
+                        "read_threshold": self.READ_THRESHOLD,
+                        "title_length_threshold": self.TITLE_LENGTH_LIMIT,
+                    }
+                )
+
+            case "strategy_v3":
+                self.cold_start_records.append(
+                    {
+                        "category": category,
+                        "cold_start_num": min(daily_article_num, len(filter_df)),
+                        "total_length": total_length,
+                        "filter_by_title_length": length_level1,
+                        "filter_by_sensitivity": length_level2,
+                        "filter_by_llm_sensitivity": length_level3,
                         "filter_by_score": length_level4,
                         "read_avg_threshold": self.READ_TIMES_THRESHOLD,
                         "read_threshold": self.READ_THRESHOLD,

+ 123 - 98
applications/tasks/cold_start_tasks/article_pool_cold_start.py

@@ -228,6 +228,118 @@ class ArticlePoolColdStart(ArticlePoolColdStartStrategy, ArticlePoolFilterStrate
                     article_id_list=article_id_list
                 )
 
+            case "strategy_v3":
+                url_list = filter_article_df["link"].values.tolist()
+                await self.create_crawler_plan_and_bind_to_produce_plan(
+                    strategy, crawl_method, category, platform, url_list, plan_id
+                )
+                # change article status
+                article_id_list = filter_article_df["article_id"].values.tolist()
+                await self.change_article_status_while_publishing(
+                    article_id_list=article_id_list
+                )
+
+    async def cold_start_by_category(self, category_list, platform, strategy):
+        """Run the per-category cold start and report a summary table to Feishu.
+
+        Shared by the "strategy_v2" and "strategy_v3" branches of ``deal``.
+
+        For every category this looks up its plan id in
+        ``cold_start_category_map``, calls ``filter_published_titles`` for that
+        plan, logs the affected row count, calls ``create_cold_start_plan`` and
+        then sleeps 120 seconds. A failure in one category is reported through
+        ``feishu_robot.bot`` and does not stop the remaining categories.
+
+        Args:
+            category_list: Categories to process; when empty or ``None`` every
+                key of ``cold_start_category_map`` is used.
+            platform: Platform name written to the log entry and forwarded to
+                ``create_cold_start_plan``.
+            strategy: Strategy name forwarded to ``create_cold_start_plan`` and
+                included in failure reports.
+        """
+        if not category_list:
+            category_list = list(cold_start_category_map.keys())
+
+        for category in tqdm(category_list):
+            try:
+                plan_id = cold_start_category_map[category]
+                affected_rows = await self.filter_published_titles(plan_id)
+                await self.log_client.log(
+                    contents={
+                        "task": "article_pool_cold_start",
+                        "platform": platform,
+                        "category": category,
+                        "status": "success",
+                        "trace_id": self.trace_id,
+                        "message": "通过已抓取标题修改文章状态",
+                        "data": {"affected_rows": affected_rows},
+                    }
+                )
+                await self.create_cold_start_plan(
+                    platform=platform,
+                    strategy=strategy,
+                    plan_id=plan_id,
+                    category=category,
+                )
+                await asyncio.sleep(120)
+            except Exception as e:
+                # Best-effort per category: report the failure and keep going.
+                await feishu_robot.bot(
+                    title="文章冷启动异常",
+                    detail={
+                        "category": category,
+                        "strategy": strategy,
+                        "error": str(e),
+                        # Name the method that actually failed; this loop used
+                        # to live in ``deal`` and kept reporting that name.
+                        "function": "cold_start_by_category",
+                        "traceback": traceback.format_exc(),
+                    },
+                )
+
+        # Nothing was recorded, so there is no summary table to send.
+        if not self.cold_start_records:
+            return
+
+        # (sheet_type, sheet_name, display_name) for each summary column, in
+        # display order.
+        column_specs = [
+            ("plain_text", "category", "文章品类"),
+            ("number", "cold_start_num", "本次冷启数量"),
+            ("number", "total_length", "总文章剩余数量"),
+            ("number", "filter_by_title_length", "标题长度过滤"),
+            ("number", "filter_by_sensitivity", "敏感词过滤"),
+            ("number", "filter_by_llm_sensitivity", "经过大模型判断敏感过滤"),
+            ("number", "filter_by_score", "经过相关性分过滤"),
+            ("number", "read_avg_threshold", "阅读均值倍数阈值"),
+            ("number", "read_threshold", "阅读量阈值"),
+            ("number", "title_length_threshold", "标题长度阈值"),
+        ]
+        columns = [
+            feishu_robot.create_feishu_columns_sheet(
+                sheet_type=sheet_type,
+                sheet_name=sheet_name,
+                display_name=display_name,
+            )
+            for sheet_type, sheet_name, display_name in column_specs
+        ]
+        await feishu_robot.bot(
+            title="长文文章路冷启动发布",
+            detail={
+                "columns": columns,
+                "rows": self.cold_start_records,
+            },
+            table=True,
+            mention=False,
+        )
+
     async def deal(
         self,
         platform: str,
@@ -288,105 +400,18 @@ class ArticlePoolColdStart(ArticlePoolColdStartStrategy, ArticlePoolFilterStrate
                         )
 
             case "strategy_v2":
-                if not category_list:
-                    category_list = list(cold_start_category_map.keys())
-
-                for category in tqdm(category_list):
-                    try:
-                        plan_id = cold_start_category_map[category]
-                        affected_rows = await self.filter_published_titles(plan_id)
-                        await self.log_client.log(
-                            contents={
-                                "task": "article_pool_cold_start",
-                                "platform": platform,
-                                "category": category,
-                                "status": "success",
-                                "trace_id": self.trace_id,
-                                "message": "通过已抓取标题修改文章状态",
-                                "data": {"affected_rows": affected_rows},
-                            }
-                        )
-                        await self.create_cold_start_plan(
-                            platform=platform,
-                            strategy=strategy,
-                            plan_id=plan_id,
-                            category=category,
-                        )
-                        await asyncio.sleep(120)
-                    except Exception as e:
-                        await feishu_robot.bot(
-                            title="文章冷启动异常",
-                            detail={
-                                "category": category,
-                                "strategy": strategy,
-                                "error": str(e),
-                                "function": "deal",
-                                "traceback": traceback.format_exc(),
-                            },
-                        )
+                await self.cold_start_by_category(
+                    category_list=category_list,
+                    platform=platform,
+                    strategy=strategy,
+                )
 
-                if self.cold_start_records:
-                    columns = [
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="plain_text",
-                            sheet_name="category",
-                            display_name="文章品类",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="cold_start_num",
-                            display_name="本次冷启数量",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="total_length",
-                            display_name="总文章剩余数量",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="filter_by_title_length",
-                            display_name="标题长度过滤",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="filter_by_sensitivity",
-                            display_name="敏感词过滤",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="filter_by_llm_sensitity",
-                            display_name="经过大模型判断敏感过滤",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="filter_by_score",
-                            display_name="经过相关性分过滤",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="read_avg_threshold",
-                            display_name="阅读均值倍数阈值",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="read_threshold",
-                            display_name="阅读量阈值",
-                        ),
-                        feishu_robot.create_feishu_columns_sheet(
-                            sheet_type="number",
-                            sheet_name="title_length_threshold",
-                            display_name="标题长度阈值",
-                        ),
-                    ]
-                    await feishu_robot.bot(
-                        title="长文文章路冷启动发布",
-                        detail={
-                            "columns": columns,
-                            "rows": self.cold_start_records,
-                        },
-                        table=True,
-                        mention=False,
-                    )
+            case "strategy_v3":
+                await self.cold_start_by_category(
+                    category_list=category_list,
+                    platform=platform,
+                    strategy=strategy,
+                )
 
             case _:
                 raise Exception(f"error strategy {strategy}")