Browse Source

修改阅读阈值

luojunhui 1 month ago
parent
commit
249de187f5
1 changed file with 54 additions and 46 deletions
  1. +54 −46
      applications/tasks/cold_start_tasks/article_pool_cold_start.py

+ 54 - 46
applications/tasks/cold_start_tasks/article_pool_cold_start.py

@@ -25,7 +25,8 @@ class ArticlePoolColdStartConst:
     INIT_STATUS = 1
     BAD_STATUS = 0
     READ_TIMES_THRESHOLD = 1.3
-    READ_THRESHOLD = 5000
+    # READ_THRESHOLD = 5000
+    READ_THRESHOLD = 1000
 
     TITLE_LENGTH_LIMIT = 15
     TITLE_LENGTH_MAX = 50
@@ -33,43 +34,12 @@ class ArticlePoolColdStartConst:
     DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "account_association"]
 
 
-class ArticlePoolColdStart(ArticlePoolColdStartConst):
+class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
     def __init__(self, pool, log_client, trace_id):
         self.pool = pool
         self.log_client = log_client
         self.trace_id = trace_id
 
-    async def get_article_from_meta_table(
-        self, platform: str, crawl_method: str, strategy: str
-    ) -> DataFrame:
-        """
-        @param platform: 文章抓取平台
-        @param crawl_method: 文章抓取模式
-        @param strategy: 供给策略
-        """
-        match platform:
-            case "weixin":
-                article_list = await self.get_weixin_cold_start_articles(
-                    crawl_method, strategy
-                )
-            case "toutiao":
-                article_list = await self.get_toutiao_cold_start_articles(
-                    crawl_method, strategy
-                )
-            case _:
-                raise ValueError("Invalid platform")
-        return DataFrame(
-            article_list,
-            columns=[
-                "article_id",
-                "title",
-                "link",
-                "llm_sensitivity",
-                "score",
-                "category_by_ai",
-            ],
-        )
-
     async def get_weixin_cold_start_articles(
         self, crawl_method: str, strategy: str
     ) -> List[Dict]:
@@ -124,20 +94,8 @@ class ArticlePoolColdStart(ArticlePoolColdStartConst):
             case _:
                 raise ValueError("Invalid strategy")
 
-    async def filter_published_titles(self, plan_id):
-        """
-        过滤已添加至aigc中的标题
-        """
-        published_title_tuple = await get_titles_from_produce_plan(self.pool, plan_id)
-        update_query = f"""
-            update crawler_meta_article set status = %s where title in %s and status = %s;
-        """
-        changed_rows = await self.pool.async_save(
-            query=update_query,
-            params=(self.PUBLISHED_STATUS, published_title_tuple, self.INIT_STATUS),
-        )
-        return changed_rows
 
+class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
     async def filter_weixin_articles(self, dataframe, crawl_method):
         """微信过滤漏斗"""
         total_length: int = dataframe.shape[0]
@@ -215,6 +173,56 @@ class ArticlePoolColdStart(ArticlePoolColdStartConst):
         )
         return filter_df[: self.DAILY_ARTICLE_NUM]
 
+
+class ArticlePoolColdStart(ArticlePoolColdStartStrategy, ArticlePoolFilterStrategy):
+    def __init__(self, pool, log_client, trace_id):
+        super().__init__(pool, log_client, trace_id)
+
+    async def get_article_from_meta_table(
+        self, platform: str, crawl_method: str, strategy: str
+    ) -> DataFrame:
+        """
+        @param platform: 文章抓取平台
+        @param crawl_method: 文章抓取模式
+        @param strategy: 供给策略
+        """
+        match platform:
+            case "weixin":
+                article_list = await self.get_weixin_cold_start_articles(
+                    crawl_method, strategy
+                )
+            case "toutiao":
+                article_list = await self.get_toutiao_cold_start_articles(
+                    crawl_method, strategy
+                )
+            case _:
+                raise ValueError("Invalid platform")
+        return DataFrame(
+            article_list,
+            columns=[
+                "article_id",
+                "title",
+                "link",
+                "llm_sensitivity",
+                "score",
+                "category_by_ai",
+            ],
+        )
+
+    async def filter_published_titles(self, plan_id):
+        """
+        过滤已添加至aigc中的标题
+        """
+        published_title_tuple = await get_titles_from_produce_plan(self.pool, plan_id)
+        update_query = f"""
+            update crawler_meta_article set status = %s where title in %s and status = %s;
+        """
+        changed_rows = await self.pool.async_save(
+            query=update_query,
+            params=(self.PUBLISHED_STATUS, published_title_tuple, self.INIT_STATUS),
+        )
+        return changed_rows
+
     async def insert_crawler_plan_into_database(
         self, crawler_plan_id, crawler_plan_name, create_timestamp
     ):