|
@@ -82,11 +82,12 @@ class CategoryColdStartTask(object):
|
|
|
"""
|
|
|
sql = f"""
|
|
|
SELECT
|
|
|
- article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity
|
|
|
+ article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
|
|
|
FROM
|
|
|
crawler_meta_article
|
|
|
WHERE
|
|
|
- category = "{category}" and platform = "{article_source}";
|
|
|
+ category = "{category}" and platform = "{article_source}"
|
|
|
+ ORDER BY score DESC;
|
|
|
"""
|
|
|
article_list = self.db_client.select(sql)
|
|
|
log(
|
|
@@ -99,7 +100,7 @@ class CategoryColdStartTask(object):
|
|
|
}
|
|
|
)
|
|
|
article_df = DataFrame(article_list,
|
|
|
- columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity'])
|
|
|
+ columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity', 'score'])
|
|
|
return article_df
|
|
|
|
|
|
def change_article_status(self, category):
|
|
@@ -206,6 +207,11 @@ class CategoryColdStartTask(object):
|
|
|
~(filter_df['llm_sensitivity'] > 0)
|
|
|
]
|
|
|
length_level5 = filter_df.shape[0]
|
|
|
+
|
|
|
+ # 第六层通过相关性分数过滤
|
|
|
+ filter_df = filter_df[filter_df['score'] > 0.4]
|
|
|
+ length_level6 = filter_df.shape[0]
|
|
|
+
|
|
|
log(
|
|
|
task="category_publish_task",
|
|
|
function="publish_filter_articles",
|
|
@@ -232,6 +238,9 @@ class CategoryColdStartTask(object):
|
|
|
"通过LLM敏感度过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
length_level4 - length_level5, length_level5
|
|
|
),
|
|
|
+ "通过相关性分数过滤": "过滤数量: {} 剩余数量: {}".format(
|
|
|
+ length_level5 - length_level6, length_level6
|
|
|
+ ),
|
|
|
"品类": category,
|
|
|
"阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
|
|
|
"阅读量阈值": self.READ_THRESHOLD,
|
|
@@ -239,7 +248,7 @@ class CategoryColdStartTask(object):
|
|
|
},
|
|
|
mention=False
|
|
|
)
|
|
|
- return filter_df
|
|
|
+ return filter_df[:1000]
|
|
|
|
|
|
def filter_toutiao_articles(self, articles_df, category):
|
|
|
"""
|