
Merge branch '2025-05-08-add-category-to-cold-start-tasks' of luojunhui/LongArticlesJob into master

luojunhui 1 month ago
parent
commit
64274d2671

+ 0 - 48
cold_start/publish/publishAccountAssociationArticles.py

@@ -1,48 +0,0 @@
-"""
-@author: luojunhui
-Publish i2u2i articles
-"""
-import pandas as pd
-from applications import DeNetMysql
-
-
-class I2U2I(object):
-    """
-    Publish account-association articles
-    """
-    db = DeNetMysql()
-
-    @classmethod
-    def getAccountPositionArticles(cls, gh_id, position):
-        """
-        Get all articles at a given position for an associated account
-        :return:
-        """
-        sql = f"""
-            select title, read_cnt, link 
-            from crawler_meta_article
-            where out_account_id = '{gh_id}' and article_index = {position};
-        """
-        article_list = cls.db.select(sql)
-        # df = pd.DataFrame(article_list, columns=['title', 'read_cnt', 'link'])
-        # read_mean = df['read_cnt'].mean()
-        # filter_response = df[
-        #     (df['read_cnt'] > read_mean * 1.3)
-        #     & (df['read_cnt'] > 5000)
-        #     ]
-        # return filter_response
-        return article_list
-
-    @classmethod
-    def filter(cls):
-        """
-        :return:
-        """
-        return
-
-if __name__ == '__main__':
-    job = I2U2I()
-    article_list = job.getAccountPositionArticles(gh_id='gh_e6be5a12e83c', position=1)
-    for article in article_list:
-        print(article)
-

+ 54 - 50
cold_start/publish/publishCategoryArticles.py

@@ -34,6 +34,7 @@ class CategoryColdStartTask(object):
         self.db_client = db_client
         self.category_map = json.loads(apollo.getConfigValue("category_cold_start_map"))
         self.category_cold_start_threshold = json.loads(apollo.getConfigValue("category_cold_start_threshold"))
+        self.article_category_list = json.loads(apollo.getConfigValue("category_list"))
         self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
         self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
         self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
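The new article_category_list is loaded from the Apollo key category_list, next to the existing category_cold_start_map and category_cold_start_threshold values. A minimal sketch of how these values are parsed, with hypothetical JSON payloads standing in for the real Apollo config (which is not part of this commit; only the 5000 and 1.3 defaults come from the task itself):

    import json

    # Hypothetical Apollo values, for illustration only.
    raw_category_list = '["health", "history", "daily_life"]'                 # key: category_list
    raw_thresholds = '{"READ_THRESHOLD": 5000, "READ_TIMES_THRESHOLD": 1.3}'  # key: category_cold_start_threshold

    article_category_list = json.loads(raw_category_list)   # list iterated in publish_filter_articles
    thresholds = json.loads(raw_thresholds)
    read_threshold = thresholds.get("READ_THRESHOLD", 5000)
    read_times_threshold = thresholds.get("READ_TIMES_THRESHOLD", 1.3)
    print(article_category_list, read_threshold, read_times_threshold)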
@@ -86,7 +87,7 @@ class CategoryColdStartTask(object):
         """
         sql = f"""
         SELECT 
-            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
+            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
         FROM
             crawler_meta_article
         WHERE 
@@ -105,7 +106,7 @@ class CategoryColdStartTask(object):
         )
         article_df = DataFrame(article_list,
                                columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
-                                        'llm_sensitivity', 'score'])
+                                        'llm_sensitivity', 'score', 'category_by_ai'])
         return article_df
 
     def filter_each_category(self, category):
@@ -341,55 +342,58 @@ class CategoryColdStartTask(object):
             except Exception as e:
                 print("failed to update sensitive status: {}".format(e))
 
-        url_list = filtered_articles_df['link'].values.tolist()
-        if url_list:
-            # create_crawler_plan
-            crawler_plan_response = aiditApi.auto_create_crawler_task(
-                plan_id=None,
-                plan_name="自动绑定-{}--{}--{}".format(category, datetime.date.today().__str__(), len(url_list)),
-                plan_tag="品类冷启动",
-                article_source=article_source,
-                url_list=url_list
-            )
-            log(
-                task="category_publish_task",
-                function="publish_filter_articles",
-                message="成功创建抓取计划",
-                data=crawler_plan_response
-            )
-            # save to db
-            create_timestamp = int(time.time()) * 1000
-            crawler_plan_id = crawler_plan_response['data']['id']
-            crawler_plan_name = crawler_plan_response['data']['name']
-            self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
-
-            # auto bind to generate plan
-            new_crawler_task_list = [
-                {
-                    "contentType": 1,
-                    "inputSourceType": 2,
-                    "inputSourceSubType": None,
-                    "fieldName": None,
-                    "inputSourceValue": crawler_plan_id,
-                    "inputSourceLabel": crawler_plan_name,
-                    "inputSourceModal": 3,
-                    "inputSourceChannel": input_source_channel
-                }
-            ]
-            generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
-                crawler_task_list=new_crawler_task_list,
-                generate_task_id=self.category_map[category]
-            )
-            log(
-                task="category_publish_task",
-                function="publish_filter_articles",
-                message="成功绑定到生成计划",
-                data=generate_plan_response
-            )
+        # split articles by AI-assigned category
+        for ai_category in self.article_category_list:
+            filter_category_df = filtered_articles_df[filtered_articles_df['category_by_ai'] == ai_category]
+            url_list = filter_category_df['link'].values.tolist()
+            if url_list:
+                # create_crawler_plan
+                crawler_plan_response = aiditApi.auto_create_crawler_task(
+                    plan_id=None,
+                    plan_name="自动绑定-{}-{}-{}--{}".format(category, ai_category, datetime.date.today().__str__(), len(url_list)),
+                    plan_tag="品类冷启动",
+                    article_source=article_source,
+                    url_list=url_list
+                )
+                log(
+                    task="category_publish_task",
+                    function="publish_filter_articles",
+                    message="成功创建抓取计划",
+                    data=crawler_plan_response
+                )
+                # save to db
+                create_timestamp = int(time.time()) * 1000
+                crawler_plan_id = crawler_plan_response['data']['id']
+                crawler_plan_name = crawler_plan_response['data']['name']
+                self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
+
+                # auto bind to generate plan
+                new_crawler_task_list = [
+                    {
+                        "contentType": 1,
+                        "inputSourceType": 2,
+                        "inputSourceSubType": None,
+                        "fieldName": None,
+                        "inputSourceValue": crawler_plan_id,
+                        "inputSourceLabel": crawler_plan_name,
+                        "inputSourceModal": 3,
+                        "inputSourceChannel": input_source_channel
+                    }
+                ]
+                generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
+                    crawler_task_list=new_crawler_task_list,
+                    generate_task_id=self.category_map[category]
+                )
+                log(
+                    task="category_publish_task",
+                    function="publish_filter_articles",
+                    message="成功绑定到生成计划",
+                    data=generate_plan_response
+                )
 
-            # change article status
-            article_id_list = filtered_articles_df['article_id'].values.tolist()
-            self.change_article_status_while_publishing(article_id_list=article_id_list)
+                # change article status
+                article_id_list = filter_category_df['article_id'].values.tolist()
+                self.change_article_status_while_publishing(article_id_list=article_id_list)
 
     def do_job(self, article_source, category_list=None):
         """

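The hunk above is the core change: instead of creating one crawler plan for all filtered articles, the task now creates one plan per AI-assigned category and only marks that category's articles as published. A minimal standalone sketch of the split, using pandas with hypothetical data and a print in place of the crawler-plan and generate-plan API calls:

    from pandas import DataFrame

    # Hypothetical stand-ins for filtered_articles_df and self.article_category_list.
    filtered_articles_df = DataFrame(
        [
            (1, "https://example.com/a", "health"),
            (2, "https://example.com/b", "history"),
            (3, "https://example.com/c", "health"),
        ],
        columns=["article_id", "link", "category_by_ai"],
    )
    article_category_list = ["health", "history", "daily_life"]

    for ai_category in article_category_list:
        filter_category_df = filtered_articles_df[filtered_articles_df["category_by_ai"] == ai_category]
        url_list = filter_category_df["link"].values.tolist()
        if not url_list:
            continue  # categories with no filtered articles create no plan
        article_id_list = filter_category_df["article_id"].values.tolist()
        # in the task: create a crawler plan from url_list, bind it to
        # self.category_map[category], then update the status of article_id_list
        print(ai_category, url_list, article_id_list)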
+ 19 - 1
cold_start/publish/publish_article_pool_articles.py

@@ -10,7 +10,7 @@ from applications.db import DatabaseConnector
 from config import long_articles_config
 
 
-class CategoryColdStartTask:
+class PublishArticlePoolArticles:
     def __init__(self):
         self.db_client = DatabaseConnector(long_articles_config)
         self.db_client.connect()
@@ -35,3 +35,21 @@ class CategoryColdStartTask:
                 }
             )
 
+
+class PublishGzhArticles(PublishArticlePoolArticles):
+
+    def get_articles_by_crawler_method(self, crawler_method):
+        fetch_query = f"""
+            select
+                article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
+            from crawler_meta_article
+            where category = '{crawler_method}' and platform = 'weixin' and title_sensitivity = 0;
+        """
+        fetch_response = self.db_client.fetch(fetch_query)
+        article_data_frame = DataFrame(
+            fetch_response,
+            columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
+                     'llm_sensitivity', 'score', 'category_by_ai']
+        )
+        return article_data_frame
+
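A minimal usage sketch for the new PublishGzhArticles subclass. The import path simply mirrors the file location, and the crawler_method value is hypothetical; real values correspond to the category column of crawler_meta_article:

    # Assumes the module is importable under its repository path.
    from cold_start.publish.publish_article_pool_articles import PublishGzhArticles

    task = PublishGzhArticles()  # parent __init__ connects via DatabaseConnector(long_articles_config)
    df = task.get_articles_by_crawler_method(crawler_method="account_association")  # hypothetical value
    print(df[["article_id", "title", "read_cnt", "category_by_ai"]].head())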