|
@@ -36,7 +36,6 @@ class CategoryColdStartTask(object):
|
|
|
task="category_publish_task",
|
|
|
function="__init__",
|
|
|
message="数据库初始化连接完成,apollo配置获取完成",
|
|
|
- status="success",
|
|
|
data={
|
|
|
"category": self.category_map,
|
|
|
"threshold": self.category_cold_start_threshold
|
|
@@ -61,7 +60,6 @@ class CategoryColdStartTask(object):
|
|
|
task="category_publish_task",
|
|
|
function="get_articles_from_meta_table",
|
|
|
message="获取品类文章总数",
|
|
|
- status="success",
|
|
|
data={
|
|
|
"total_articles": len(article_list),
|
|
|
"category": category
|
|
@@ -77,28 +75,7 @@ class CategoryColdStartTask(object):
|
|
|
"""
|
|
|
plan_id = self.category_map.get(category)
|
|
|
if plan_id:
|
|
|
- sql = f"""
|
|
|
- SELECT
|
|
|
- account.wx_gh,
|
|
|
- content.title,
|
|
|
- content.content_link,
|
|
|
- content.view_count,
|
|
|
- content.like_count,
|
|
|
- from_unixtime(cprr.create_timestamp / 1000) AS 抓取时间,
|
|
|
- from_unixtime(content.publish_timestamp / 1000) AS 发布时间
|
|
|
- FROM crawler_plan_result_rel cprr
|
|
|
- JOIN crawler_plan plan ON cprr.plan_id = plan.id
|
|
|
- JOIN crawler_content content ON cprr.channel_source_id = content.channel_content_id
|
|
|
- JOIN crawler_account account ON content.channel_account_id = account.channel_account_id
|
|
|
- WHERE plan_id IN (
|
|
|
- SELECT
|
|
|
- input_source_value
|
|
|
- FROM
|
|
|
- produce_plan_input_source
|
|
|
- WHERE plan_id = '{plan_id}'
|
|
|
- );
|
|
|
- """
|
|
|
- article_list = self.db_client.select(sql)
|
|
|
+ article_list = aiditApi.get_generated_article_list(plan_id)
|
|
|
title_list = [i[1] for i in article_list]
|
|
|
if title_list:
|
|
|
# update
|
|
@@ -153,35 +130,64 @@ class CategoryColdStartTask(object):
|
|
|
"""
|
|
|
articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
|
|
|
articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
|
|
|
- filter_df = articles_df[
|
|
|
- (articles_df['read_times'] >= self.READ_TIMES_THRESHOLD)
|
|
|
- & (articles_df['read_cnt'] >= self.READ_THRESHOLD)
|
|
|
- & (articles_df['title'].str.len() > self.LIMIT_TITLE_LENGTH)
|
|
|
- & (~articles_df['title'].str.contains('农历'))
|
|
|
- & (~articles_df['title'].str.contains('太极'))
|
|
|
- & (~articles_df['title'].str.contains('节'))
|
|
|
- & (~articles_df['title'].str.contains('早上好'))
|
|
|
- & (~articles_df['title'].str.contains('赖清德'))
|
|
|
- & (~articles_df['title'].str.contains('普京'))
|
|
|
- & (~articles_df['title'].str.contains('俄'))
|
|
|
- & (~articles_df['title'].str.contains('南海'))
|
|
|
- & (~articles_df['title'].str.contains('台海'))
|
|
|
- & (~articles_df['title'].str.contains('解放军'))
|
|
|
- & (~articles_df['title'].str.contains('蔡英文'))
|
|
|
- & (~articles_df['title'].str.contains('中国'))
|
|
|
- ]
|
|
|
+ total_length = articles_df.shape[0]
|
|
|
+ # 第一层漏斗通过阅读均值倍数过滤
|
|
|
+ first_level_funnel_df = articles_df[articles_df['read_times'] >= self.READ_TIMES_THRESHOLD]
|
|
|
+ first_level_funnel_length = first_level_funnel_df.shape[0]
|
|
|
+
|
|
|
+ # 第二层漏斗通过阅读量过滤
|
|
|
+ second_level_funnel_df = first_level_funnel_df[
|
|
|
+ first_level_funnel_df['read_cnt'] >= self.READ_THRESHOLD
|
|
|
+ ]
|
|
|
+ second_level_funnel_length = second_level_funnel_df.shape[0]
|
|
|
+
|
|
|
+ # 第三层漏斗通过标题长度过滤
|
|
|
+ third_level_funnel_df = second_level_funnel_df[
|
|
|
+ second_level_funnel_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH
|
|
|
+ ]
|
|
|
+ third_level_funnel_length = third_level_funnel_df.shape[0]
|
|
|
|
|
|
+ # 最后一层通过敏感词过滤
|
|
|
+ filter_df = third_level_funnel_df[
|
|
|
+ (~third_level_funnel_df['title'].str.contains('农历'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('太极'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('节'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('早上好'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('赖清德'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('普京'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('俄'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('南海'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('台海'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('解放军'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('蔡英文'))
|
|
|
+ & (~third_level_funnel_df['title'].str.contains('中国'))
|
|
|
+ ]
|
|
|
+ final_length = filter_df.shape[0]
|
|
|
url_list = filter_df['link'].values.tolist()
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
|
message="过滤后文章总数",
|
|
|
- status="success",
|
|
|
data={
|
|
|
- "total_articles": len(url_list),
|
|
|
+ "total_articles": final_length,
|
|
|
"category": category
|
|
|
}
|
|
|
)
|
|
|
+ bot(
|
|
|
+ title="冷启任务发布通知",
|
|
|
+ detail={
|
|
|
+ "总文章数量": total_length,
|
|
|
+ "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(total_length - first_level_funnel_length, first_level_funnel_length),
|
|
|
+ "通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
|
+ "通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
|
+ "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
|
|
|
+ "品类": category,
|
|
|
+ "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
|
|
|
+ "阅读量阈值": self.READ_THRESHOLD,
|
|
|
+ "标题长度阈值": self.LIMIT_TITLE_LENGTH
|
|
|
+ },
|
|
|
+ mention=False
|
|
|
+ )
|
|
|
if url_list:
|
|
|
crawler_plan_response = aiditApi.auto_create_crawler_task(
|
|
|
plan_id=None,
|
|
@@ -193,7 +199,6 @@ class CategoryColdStartTask(object):
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
|
message="成功创建抓取计划",
|
|
|
- status="success",
|
|
|
data=crawler_plan_response
|
|
|
)
|
|
|
# auto bind to generate plan
|
|
@@ -217,7 +222,6 @@ class CategoryColdStartTask(object):
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
|
message="成功绑定到生成计划",
|
|
|
- status="success",
|
|
|
data=generate_plan_response
|
|
|
)
|
|
|
article_id_list = filter_df['article_id'].values.tolist()
|