@@ -34,6 +34,7 @@ class CategoryColdStartTask(object):
         self.db_client = db_client
         self.category_map = json.loads(apollo.getConfigValue("category_cold_start_map"))
         self.category_cold_start_threshold = json.loads(apollo.getConfigValue("category_cold_start_threshold"))
+        self.article_category_list = json.loads(apollo.getConfigValue("category_list"))
         self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
         self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
         self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
@@ -86,7 +87,7 @@ class CategoryColdStartTask(object):
         """
         sql = f"""
             SELECT
-                article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
+                article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
             FROM
                 crawler_meta_article
             WHERE
@@ -105,7 +106,7 @@ class CategoryColdStartTask(object):
         )
         article_df = DataFrame(article_list,
                                columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
-                                        'llm_sensitivity', 'score'])
+                                        'llm_sensitivity', 'score', 'category_by_ai'])
         return article_df
 
     def filter_each_category(self, category):
@@ -341,55 +342,58 @@ class CategoryColdStartTask(object):
         except Exception as e:
             print("failed to update sensitive status: {}".format(e))
 
-        url_list = filtered_articles_df['link'].values.tolist()
-        if url_list:
-            # create_crawler_plan
-            crawler_plan_response = aiditApi.auto_create_crawler_task(
-                plan_id=None,
-                plan_name="自动绑定-{}--{}--{}".format(category, datetime.date.today().__str__(), len(url_list)),
-                plan_tag="品类冷启动",
-                article_source=article_source,
-                url_list=url_list
-            )
-            log(
-                task="category_publish_task",
-                function="publish_filter_articles",
-                message="成功创建抓取计划",
-                data=crawler_plan_response
-            )
-            # save to db
-            create_timestamp = int(time.time()) * 1000
-            crawler_plan_id = crawler_plan_response['data']['id']
-            crawler_plan_name = crawler_plan_response['data']['name']
-            self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
-
-            # auto bind to generate plan
-            new_crawler_task_list = [
-                {
-                    "contentType": 1,
-                    "inputSourceType": 2,
-                    "inputSourceSubType": None,
-                    "fieldName": None,
-                    "inputSourceValue": crawler_plan_id,
-                    "inputSourceLabel": crawler_plan_name,
-                    "inputSourceModal": 3,
-                    "inputSourceChannel": input_source_channel
-                }
-            ]
-            generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
-                crawler_task_list=new_crawler_task_list,
-                generate_task_id=self.category_map[category]
-            )
-            log(
-                task="category_publish_task",
-                function="publish_filter_articles",
-                message="成功绑定到生成计划",
-                data=generate_plan_response
-            )
+        # split filtered articles into groups by AI-assigned category
+        for ai_category in self.article_category_list:
+            filter_category_df = filtered_articles_df[filtered_articles_df['category_by_ai'] == ai_category]
+            url_list = filter_category_df['link'].values.tolist()
+            if url_list:
+                # create_crawler_plan
+                crawler_plan_response = aiditApi.auto_create_crawler_task(
+                    plan_id=None,
+                    plan_name="自动绑定-{}-{}-{}--{}".format(category, ai_category, datetime.date.today().__str__(), len(url_list)),
+                    plan_tag="品类冷启动",
+                    article_source=article_source,
+                    url_list=url_list
+                )
+                log(
+                    task="category_publish_task",
+                    function="publish_filter_articles",
+                    message="成功创建抓取计划",
+                    data=crawler_plan_response
+                )
+                # save to db
+                create_timestamp = int(time.time()) * 1000
+                crawler_plan_id = crawler_plan_response['data']['id']
+                crawler_plan_name = crawler_plan_response['data']['name']
+                self.insert_into_db(crawler_plan_id, crawler_plan_name, create_timestamp)
+
+                # auto bind to generate plan
+                new_crawler_task_list = [
+                    {
+                        "contentType": 1,
+                        "inputSourceType": 2,
+                        "inputSourceSubType": None,
+                        "fieldName": None,
+                        "inputSourceValue": crawler_plan_id,
+                        "inputSourceLabel": crawler_plan_name,
+                        "inputSourceModal": 3,
+                        "inputSourceChannel": input_source_channel
+                    }
+                ]
+                generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
+                    crawler_task_list=new_crawler_task_list,
+                    generate_task_id=self.category_map[category]
+                )
+                log(
+                    task="category_publish_task",
+                    function="publish_filter_articles",
+                    message="成功绑定到生成计划",
+                    data=generate_plan_response
+                )
 
-        # change article status
-        article_id_list = filtered_articles_df['article_id'].values.tolist()
-        self.change_article_status_while_publishing(article_id_list=article_id_list)
+            # change article status
+            article_id_list = filter_category_df['article_id'].values.tolist()
+            self.change_article_status_while_publishing(article_id_list=article_id_list)
 
     def do_job(self, article_source, category_list=None):
         """