|
@@ -49,11 +49,11 @@ class CategoryColdStartTask(object):
|
|
"""
|
|
"""
|
|
sql = f"""
|
|
sql = f"""
|
|
SELECT
|
|
SELECT
|
|
- article_id, out_account_id, article_index, title, link, read_cnt
|
|
|
|
|
|
+ article_id, out_account_id, article_index, title, link, read_cnt, status
|
|
FROM
|
|
FROM
|
|
crawler_meta_article
|
|
crawler_meta_article
|
|
WHERE
|
|
WHERE
|
|
- category = "{category}" and status = '{self.INIT_STATUS}';
|
|
|
|
|
|
+ category = "{category}";
|
|
"""
|
|
"""
|
|
article_list = self.db_client.select(sql)
|
|
article_list = self.db_client.select(sql)
|
|
log(
|
|
log(
|
|
@@ -65,7 +65,7 @@ class CategoryColdStartTask(object):
|
|
"category": category
|
|
"category": category
|
|
}
|
|
}
|
|
)
|
|
)
|
|
- article_df = DataFrame(article_list, columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt'])
|
|
|
|
|
|
+ article_df = DataFrame(article_list, columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status'])
|
|
return article_df
|
|
return article_df
|
|
|
|
|
|
def change_article_status(self, category):
|
|
def change_article_status(self, category):
|
|
@@ -131,8 +131,12 @@ class CategoryColdStartTask(object):
|
|
articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
|
|
articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
|
|
articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
|
|
articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
|
|
total_length = articles_df.shape[0]
|
|
total_length = articles_df.shape[0]
|
|
|
|
+ # 第0层过滤已经发布的文章
|
|
|
|
+ zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
|
|
|
|
+ zero_level_funnel_length = zero_level_funnel_df.shape[0]
|
|
|
|
+
|
|
# 第一层漏斗通过阅读均值倍数过滤
|
|
# 第一层漏斗通过阅读均值倍数过滤
|
|
- first_level_funnel_df = articles_df[articles_df['read_times'] >= self.READ_TIMES_THRESHOLD]
|
|
|
|
|
|
+ first_level_funnel_df = zero_level_funnel_df[zero_level_funnel_df['read_times'] >= self.READ_TIMES_THRESHOLD]
|
|
first_level_funnel_length = first_level_funnel_df.shape[0]
|
|
first_level_funnel_length = first_level_funnel_df.shape[0]
|
|
|
|
|
|
# 第二层漏斗通过阅读量过滤
|
|
# 第二层漏斗通过阅读量过滤
|
|
@@ -177,7 +181,8 @@ class CategoryColdStartTask(object):
|
|
title="冷启任务发布通知",
|
|
title="冷启任务发布通知",
|
|
detail={
|
|
detail={
|
|
"总文章数量": total_length,
|
|
"总文章数量": total_length,
|
|
- "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(total_length - first_level_funnel_length, first_level_funnel_length),
|
|
|
|
|
|
+ "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(total_length - zero_level_funnel_length, zero_level_funnel_length),
|
|
|
|
+ "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
|
|
"通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
"通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
"通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
"通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
"通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
|
|
"通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
|