|
@@ -34,6 +34,7 @@ class CategoryColdStartTask(object):
|
|
|
self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
|
|
|
self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
|
|
|
self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
|
|
|
+ self.TITLE_LENGTH_MAX = self.category_cold_start_threshold.get("TITLE_LENGTH_MAX", 50)
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="__init__",
|
|
@@ -81,7 +82,7 @@ class CategoryColdStartTask(object):
|
|
|
"""
|
|
|
sql = f"""
|
|
|
SELECT
|
|
|
- article_id, out_account_id, article_index, title, link, read_cnt, status
|
|
|
+ article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity
|
|
|
FROM
|
|
|
crawler_meta_article
|
|
|
WHERE
|
|
@@ -98,7 +99,7 @@ class CategoryColdStartTask(object):
|
|
|
}
|
|
|
)
|
|
|
article_df = DataFrame(article_list,
|
|
|
- columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status'])
|
|
|
+ columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity'])
|
|
|
return article_df
|
|
|
|
|
|
def change_article_status(self, category):
|
|
@@ -162,47 +163,53 @@ class CategoryColdStartTask(object):
|
|
|
articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
|
|
|
total_length = articles_df.shape[0]
|
|
|
# 第0层过滤已经发布的文章
|
|
|
- zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
|
|
|
- zero_level_funnel_length = zero_level_funnel_df.shape[0]
|
|
|
+ filter_df = articles_df[articles_df['status'] == self.INIT_STATUS]
|
|
|
+ length_level0 = filter_df.shape[0]
|
|
|
|
|
|
# 第一层漏斗通过阅读均值倍数过滤
|
|
|
- first_level_funnel_df = zero_level_funnel_df[zero_level_funnel_df['read_times'] >= self.READ_TIMES_THRESHOLD]
|
|
|
- first_level_funnel_length = first_level_funnel_df.shape[0]
|
|
|
+ filter_df = filter_df[filter_df['read_times'] >= self.READ_TIMES_THRESHOLD]
|
|
|
+ length_level1 = filter_df.shape[0]
|
|
|
|
|
|
# 第二层漏斗通过阅读量过滤
|
|
|
- second_level_funnel_df = first_level_funnel_df[
|
|
|
- first_level_funnel_df['read_cnt'] >= self.READ_THRESHOLD
|
|
|
+ filter_df = filter_df[
|
|
|
+ filter_df['read_cnt'] >= self.READ_THRESHOLD
|
|
|
]
|
|
|
- second_level_funnel_length = second_level_funnel_df.shape[0]
|
|
|
+ length_level2 = filter_df.shape[0]
|
|
|
|
|
|
# 第三层漏斗通过标题长度过滤
|
|
|
- third_level_funnel_df = second_level_funnel_df[
|
|
|
- second_level_funnel_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH
|
|
|
+ filter_df = filter_df[
|
|
|
+ (filter_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
|
|
|
+ & (filter_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
|
|
|
]
|
|
|
- third_level_funnel_length = third_level_funnel_df.shape[0]
|
|
|
+ length_level3 = filter_df.shape[0]
|
|
|
|
|
|
- # 最后一层通过敏感词过滤
|
|
|
- filter_df = third_level_funnel_df[
|
|
|
- (~third_level_funnel_df['title'].str.contains('农历'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('太极'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('节'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('早上好'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('赖清德'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('普京'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('俄'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('南海'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('台海'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('解放军'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('蔡英文'))
|
|
|
- & (~third_level_funnel_df['title'].str.contains('中国'))
|
|
|
+ # 第四层通过敏感词过滤
|
|
|
+ filter_df = filter_df[
|
|
|
+ (~filter_df['title'].str.contains('农历'))
|
|
|
+ & (~filter_df['title'].str.contains('太极'))
|
|
|
+ & (~filter_df['title'].str.contains('节'))
|
|
|
+ & (~filter_df['title'].str.contains('早上好'))
|
|
|
+ & (~filter_df['title'].str.contains('赖清德'))
|
|
|
+ & (~filter_df['title'].str.contains('普京'))
|
|
|
+ & (~filter_df['title'].str.contains('俄'))
|
|
|
+ & (~filter_df['title'].str.contains('南海'))
|
|
|
+ & (~filter_df['title'].str.contains('台海'))
|
|
|
+ & (~filter_df['title'].str.contains('解放军'))
|
|
|
+ & (~filter_df['title'].str.contains('蔡英文'))
|
|
|
+ & (~filter_df['title'].str.contains('中国'))
|
|
|
]
|
|
|
- final_length = filter_df.shape[0]
|
|
|
+ length_level4 = filter_df.shape[0]
|
|
|
+ # 第五层通过LLM敏感度过滤
|
|
|
+ filter_df = filter_df[
|
|
|
+ ~(filter_df['llm_sensitivity'] > 0)
|
|
|
+ ]
|
|
|
+ length_level5 = filter_df.shape[0]
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
|
message="过滤后文章总数",
|
|
|
data={
|
|
|
- "total_articles": final_length,
|
|
|
+ "total_articles": length_level5,
|
|
|
"category": category
|
|
|
}
|
|
|
)
|
|
@@ -210,16 +217,19 @@ class CategoryColdStartTask(object):
|
|
|
title="冷启任务发布通知",
|
|
|
detail={
|
|
|
"总文章数量": total_length,
|
|
|
- "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(total_length - zero_level_funnel_length,
|
|
|
- zero_level_funnel_length),
|
|
|
+ "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ total_length - length_level0, length_level0),
|
|
|
"通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
- zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
|
|
|
+ length_level0 - length_level1, length_level1),
|
|
|
"通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
- first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
|
|
|
+ length_level1 - length_level2, length_level2),
|
|
|
"通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
- second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
|
|
|
- "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(third_level_funnel_length - final_length,
|
|
|
- final_length),
|
|
|
+ length_level2 - length_level3, length_level3),
|
|
|
+ "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ length_level3 - length_level4, length_level4),
|
|
|
+ "通过LLM敏感度过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ length_level4 - length_level5, length_level5
|
|
|
+ ),
|
|
|
"品类": category,
|
|
|
"阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
|
|
|
"阅读量阈值": self.READ_THRESHOLD,
|