|  | @@ -34,6 +34,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          self.db_client = db_client
 | 
	
		
			
				|  |  |          self.category_map = json.loads(apollo.getConfigValue("category_cold_start_map"))
 | 
	
		
			
				|  |  |          self.category_cold_start_threshold = json.loads(apollo.getConfigValue("category_cold_start_threshold"))
 | 
	
		
			
				|  |  | +        self.article_category_list = json.loads(apollo.getConfigValue("category_list"))
 | 
	
		
			
				|  |  |          self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
 | 
	
		
			
				|  |  |          self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
 | 
	
		
			
				|  |  |          self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
 | 
	
	
		
			
				|  | @@ -86,7 +87,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          sql = f"""
 | 
	
		
			
				|  |  |          SELECT 
 | 
	
		
			
				|  |  | -            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
 | 
	
		
			
				|  |  | +            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
 | 
	
		
			
				|  |  |          FROM
 | 
	
		
			
				|  |  |              crawler_meta_article
 | 
	
		
			
				|  |  |          WHERE 
 | 
	
	
		
			
				|  | @@ -105,7 +106,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          )
 | 
	
		
			
				|  |  |          article_df = DataFrame(article_list,
 | 
	
		
			
				|  |  |                                 columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
 | 
	
		
			
				|  |  | -                                        'llm_sensitivity', 'score'])
 | 
	
		
			
				|  |  | +                                        'llm_sensitivity', 'score', 'category_by_ai'])
 | 
	
		
			
				|  |  |          return article_df
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def filter_each_category(self, category):
 | 
	
	
		
			
				|  | @@ -341,55 +342,58 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |              except Exception as e:
 | 
	
		
			
				|  |  |                  print("failed to update sensitive status: {}".format(e))
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -        url_list = filtered_articles_df['link'].values.tolist()
 | 
	
		
			
				|  |  | -        if url_list:
 | 
	
		
			
				|  |  | -            # create_crawler_plan
 | 
	
		
			
				|  |  | -            crawler_plan_response = aiditApi.auto_create_crawler_task(
 | 
	
		
			
				|  |  | -                plan_id=None,
 | 
	
		
			
				|  |  | -                plan_name="自动绑定-{}--{}--{}".format(category, datetime.date.today().__str__(), len(url_list)),
 | 
	
		
			
				|  |  | -                plan_tag="品类冷启动",
 | 
	
		
			
				|  |  | -                article_source=article_source,
 | 
	
		
			
				|  |  | -                url_list=url_list
 | 
	
		
			
				|  |  | -            )
 | 
	
		
			
				|  |  | -            log(
 | 
	
		
			
				|  |  | -                task="category_publish_task",
 | 
	
		
			
				|  |  | -                function="publish_filter_articles",
 | 
	
		
			
				|  |  | -                message="成功创建抓取计划",
 | 
	
		
			
				|  |  | -                data=crawler_plan_response
 | 
	
		
			
				|  |  | -            )
 | 
	
		
			
				|  |  | -            # save to db
 | 
	
		
			
				|  |  | -            create_timestamp = int(time.time()) * 1000
 | 
	
		
			
				|  |  | -            crawler_plan_id = crawler_plan_response['data']['id']
 | 
	
		
			
				|  |  | -            crawler_plan_name = crawler_plan_response['data']['name']
 | 
	
		
			
				|  |  | -            self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            # auto bind to generate plan
 | 
	
		
			
				|  |  | -            new_crawler_task_list = [
 | 
	
		
			
				|  |  | -                {
 | 
	
		
			
				|  |  | -                    "contentType": 1,
 | 
	
		
			
				|  |  | -                    "inputSourceType": 2,
 | 
	
		
			
				|  |  | -                    "inputSourceSubType": None,
 | 
	
		
			
				|  |  | -                    "fieldName": None,
 | 
	
		
			
				|  |  | -                    "inputSourceValue": crawler_plan_id,
 | 
	
		
			
				|  |  | -                    "inputSourceLabel": crawler_plan_name,
 | 
	
		
			
				|  |  | -                    "inputSourceModal": 3,
 | 
	
		
			
				|  |  | -                    "inputSourceChannel": input_source_channel
 | 
	
		
			
				|  |  | -                }
 | 
	
		
			
				|  |  | -            ]
 | 
	
		
			
				|  |  | -            generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
 | 
	
		
			
				|  |  | -                crawler_task_list=new_crawler_task_list,
 | 
	
		
			
				|  |  | -                generate_task_id=self.category_map[category]
 | 
	
		
			
				|  |  | -            )
 | 
	
		
			
				|  |  | -            log(
 | 
	
		
			
				|  |  | -                task="category_publish_task",
 | 
	
		
			
				|  |  | -                function="publish_filter_articles",
 | 
	
		
			
				|  |  | -                message="成功绑定到生成计划",
 | 
	
		
			
				|  |  | -                data=generate_plan_response
 | 
	
		
			
				|  |  | -            )
 | 
	
		
			
				|  |  | +        # split into different category
 | 
	
		
			
				|  |  | +        for ai_category in self.article_category_list:
 | 
	
		
			
				|  |  | +            filter_category_df = filtered_articles_df[filtered_articles_df['ai_category'] == ai_category]
 | 
	
		
			
				|  |  | +            url_list = filter_category_df['link'].values.tolist()
 | 
	
		
			
				|  |  | +            if url_list:
 | 
	
		
			
				|  |  | +                # create_crawler_plan
 | 
	
		
			
				|  |  | +                crawler_plan_response = aiditApi.auto_create_crawler_task(
 | 
	
		
			
				|  |  | +                    plan_id=None,
 | 
	
		
			
				|  |  | +                    plan_name="自动绑定-{}-{}-{}--{}".format(category, ai_category,datetime.date.today().__str__(), len(url_list)),
 | 
	
		
			
				|  |  | +                    plan_tag="品类冷启动",
 | 
	
		
			
				|  |  | +                    article_source=article_source,
 | 
	
		
			
				|  |  | +                    url_list=url_list
 | 
	
		
			
				|  |  | +                )
 | 
	
		
			
				|  |  | +                log(
 | 
	
		
			
				|  |  | +                    task="category_publish_task",
 | 
	
		
			
				|  |  | +                    function="publish_filter_articles",
 | 
	
		
			
				|  |  | +                    message="成功创建抓取计划",
 | 
	
		
			
				|  |  | +                    data=crawler_plan_response
 | 
	
		
			
				|  |  | +                )
 | 
	
		
			
				|  |  | +                # save to db
 | 
	
		
			
				|  |  | +                create_timestamp = int(time.time()) * 1000
 | 
	
		
			
				|  |  | +                crawler_plan_id = crawler_plan_response['data']['id']
 | 
	
		
			
				|  |  | +                crawler_plan_name = crawler_plan_response['data']['name']
 | 
	
		
			
				|  |  | +                self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                # auto bind to generate plan
 | 
	
		
			
				|  |  | +                new_crawler_task_list = [
 | 
	
		
			
				|  |  | +                    {
 | 
	
		
			
				|  |  | +                        "contentType": 1,
 | 
	
		
			
				|  |  | +                        "inputSourceType": 2,
 | 
	
		
			
				|  |  | +                        "inputSourceSubType": None,
 | 
	
		
			
				|  |  | +                        "fieldName": None,
 | 
	
		
			
				|  |  | +                        "inputSourceValue": crawler_plan_id,
 | 
	
		
			
				|  |  | +                        "inputSourceLabel": crawler_plan_name,
 | 
	
		
			
				|  |  | +                        "inputSourceModal": 3,
 | 
	
		
			
				|  |  | +                        "inputSourceChannel": input_source_channel
 | 
	
		
			
				|  |  | +                    }
 | 
	
		
			
				|  |  | +                ]
 | 
	
		
			
				|  |  | +                generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
 | 
	
		
			
				|  |  | +                    crawler_task_list=new_crawler_task_list,
 | 
	
		
			
				|  |  | +                    generate_task_id=self.category_map[category]
 | 
	
		
			
				|  |  | +                )
 | 
	
		
			
				|  |  | +                log(
 | 
	
		
			
				|  |  | +                    task="category_publish_task",
 | 
	
		
			
				|  |  | +                    function="publish_filter_articles",
 | 
	
		
			
				|  |  | +                    message="成功绑定到生成计划",
 | 
	
		
			
				|  |  | +                    data=generate_plan_response
 | 
	
		
			
				|  |  | +                )
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -            # change article status
 | 
	
		
			
				|  |  | -            article_id_list = filtered_articles_df['article_id'].values.tolist()
 | 
	
		
			
				|  |  | -            self.change_article_status_while_publishing(article_id_list=article_id_list)
 | 
	
		
			
				|  |  | +                # change article status
 | 
	
		
			
				|  |  | +                article_id_list = filter_category_df['article_id'].values.tolist()
 | 
	
		
			
				|  |  | +                self.change_article_status_while_publishing(article_id_list=article_id_list)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def do_job(self, article_source, category_list=None):
 | 
	
		
			
				|  |  |          """
 |