|
@@ -74,7 +74,7 @@ class CategoryColdStartTask(object):
|
|
|
}
|
|
|
)
|
|
|
|
|
|
- def get_articles_from_meta_table(self, category):
|
|
|
+ def get_articles_from_meta_table(self, category, article_source):
|
|
|
"""
|
|
|
从长文 meta 库中获取冷启文章
|
|
|
:return:
|
|
@@ -85,7 +85,7 @@ class CategoryColdStartTask(object):
|
|
|
FROM
|
|
|
crawler_meta_article
|
|
|
WHERE
|
|
|
- category = "{category}";
|
|
|
+ category = "{category}" and platform = "{article_source}" and status = {self.INIT_STATUS};
|
|
|
"""
|
|
|
article_list = self.db_client.select(sql)
|
|
|
log(
|
|
@@ -153,12 +153,9 @@ class CategoryColdStartTask(object):
|
|
|
}
|
|
|
)
|
|
|
|
|
|
- def publish_filter_articles(self, category, articles_df):
|
|
|
+ def filter_weixin_articles(self, articles_df, category):
|
|
|
"""
|
|
|
- 过滤文章
|
|
|
- :param category:
|
|
|
- :param articles_df:
|
|
|
- :return:
|
|
|
+ 微信抓取文章过滤漏斗
|
|
|
"""
|
|
|
articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
|
|
|
articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
|
|
@@ -174,13 +171,13 @@ class CategoryColdStartTask(object):
|
|
|
# 第二层漏斗通过阅读量过滤
|
|
|
second_level_funnel_df = first_level_funnel_df[
|
|
|
first_level_funnel_df['read_cnt'] >= self.READ_THRESHOLD
|
|
|
- ]
|
|
|
+ ]
|
|
|
second_level_funnel_length = second_level_funnel_df.shape[0]
|
|
|
|
|
|
# 第三层漏斗通过标题长度过滤
|
|
|
third_level_funnel_df = second_level_funnel_df[
|
|
|
second_level_funnel_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH
|
|
|
- ]
|
|
|
+ ]
|
|
|
third_level_funnel_length = third_level_funnel_df.shape[0]
|
|
|
|
|
|
# 最后一层通过敏感词过滤
|
|
@@ -199,7 +196,6 @@ class CategoryColdStartTask(object):
|
|
|
& (~third_level_funnel_df['title'].str.contains('中国'))
|
|
|
]
|
|
|
final_length = filter_df.shape[0]
|
|
|
- url_list = filter_df['link'].values.tolist()
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
@@ -213,11 +209,16 @@ class CategoryColdStartTask(object):
|
|
|
title="冷启任务发布通知",
|
|
|
detail={
|
|
|
"总文章数量": total_length,
|
|
|
- "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(total_length - zero_level_funnel_length, zero_level_funnel_length),
|
|
|
- "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
|
|
|
- "通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
|
- "通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
|
- "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
|
|
|
+ "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(total_length - zero_level_funnel_length,
|
|
|
+ zero_level_funnel_length),
|
|
|
+ "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
|
|
|
+ "通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
|
+ "通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
|
+ "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length,
|
|
|
+ final_length),
|
|
|
"品类": category,
|
|
|
"阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
|
|
|
"阅读量阈值": self.READ_THRESHOLD,
|
|
@@ -225,12 +226,34 @@ class CategoryColdStartTask(object):
|
|
|
},
|
|
|
mention=False
|
|
|
)
|
|
|
+ return filter_df
|
|
|
+
|
|
|
+ def publish_filter_articles(self, category, articles_df, article_source):
|
|
|
+ """
|
|
|
+ 过滤文章
|
|
|
+ :param category: 文章品类
|
|
|
+ :param articles_df: 该品类下的文章data_frame
|
|
|
+ :param article_source: 文章来源
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ match article_source:
|
|
|
+ case "weixin":
|
|
|
+ filtered_articles_df = self.filter_weixin_articles(articles_df, category)
|
|
|
+ input_source_channel = 5
|
|
|
+ case "toutiao":
|
|
|
+ filtered_articles_df = articles_df
|
|
|
+ input_source_channel = 6
|
|
|
+ case _:
|
|
|
+ return
|
|
|
+
|
|
|
+ url_list = filtered_articles_df['link'].values.tolist()
|
|
|
if url_list:
|
|
|
# create_crawler_plan
|
|
|
crawler_plan_response = aiditApi.auto_create_crawler_task(
|
|
|
plan_id=None,
|
|
|
plan_name="自动绑定-{}--{}--{}".format(category, datetime.date.today().__str__(), len(url_list)),
|
|
|
plan_tag="品类冷启动",
|
|
|
+ article_source=article_source,
|
|
|
url_list=url_list
|
|
|
)
|
|
|
log(
|
|
@@ -239,7 +262,6 @@ class CategoryColdStartTask(object):
|
|
|
message="成功创建抓取计划",
|
|
|
data=crawler_plan_response
|
|
|
)
|
|
|
-
|
|
|
# save to db
|
|
|
create_timestamp = int(time.time()) * 1000
|
|
|
crawler_plan_id = crawler_plan_response['data']['id']
|
|
@@ -253,10 +275,10 @@ class CategoryColdStartTask(object):
|
|
|
"inputSourceType": 2,
|
|
|
"inputSourceSubType": None,
|
|
|
"fieldName": None,
|
|
|
- "inputSourceValue": crawler_plan_response['data']['id'],
|
|
|
- "inputSourceLabel": crawler_plan_response['data']['name'],
|
|
|
+ "inputSourceValue": crawler_plan_id,
|
|
|
+ "inputSourceLabel": crawler_plan_name,
|
|
|
"inputSourceModal": 3,
|
|
|
- "inputSourceChannel": 5
|
|
|
+ "inputSourceChannel": input_source_channel
|
|
|
}
|
|
|
]
|
|
|
generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
|
|
@@ -271,16 +293,17 @@ class CategoryColdStartTask(object):
|
|
|
)
|
|
|
|
|
|
# change article status
|
|
|
- article_id_list = filter_df['article_id'].values.tolist()
|
|
|
+ article_id_list = articles_df['article_id'].values.tolist()
|
|
|
self.change_article_status_while_publishing(article_id_list=article_id_list)
|
|
|
|
|
|
- def do_job(self, category_list=None):
|
|
|
+ def do_job(self, article_source, category_list=None):
|
|
|
"""
|
|
|
执行任务
|
|
|
:return:
|
|
|
"""
|
|
|
if not category_list:
|
|
|
category_list = self.category_map.keys()
|
|
|
+
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="do_job",
|
|
@@ -291,10 +314,11 @@ class CategoryColdStartTask(object):
|
|
|
)
|
|
|
for category in category_list:
|
|
|
try:
|
|
|
- category_df = self.get_articles_from_meta_table(category=category)
|
|
|
+ category_df = self.get_articles_from_meta_table(category=category, article_source=article_source)
|
|
|
self.publish_filter_articles(
|
|
|
category=category,
|
|
|
- articles_df=category_df
|
|
|
+ articles_df=category_df,
|
|
|
+ article_source=article_source
|
|
|
)
|
|
|
except Exception as e:
|
|
|
bot(
|
|
@@ -302,6 +326,7 @@ class CategoryColdStartTask(object):
|
|
|
detail={
|
|
|
"category": category,
|
|
|
"error": str(e),
|
|
|
- "function": "do_job"
|
|
|
+ "function": "do_job",
|
|
|
+ "traceback": traceback.format_exc()
|
|
|
}
|
|
|
- )
|
|
|
+ )
|