@@ -13,7 +13,7 @@ from applications import aiditApi, log, bot, llm_sensitivity
 from config import apolloConfig

 apollo = apolloConfig()

-DAILY_CRAWLER_MAX_NUM = 1000
+DAILY_CRAWLER_MAX_NUM = 100
 SIMILARITY_MIN_SCORE = 0.4
 TITLE_NOT_SENSITIVE = 0
@@ -86,14 +86,18 @@ class CategoryColdStartTask(object):
         """
         sql = f"""
-        SELECT
-            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
-        FROM
-            crawler_meta_article
-        WHERE
-            category = "{category}" and platform = "{article_source}" and title_sensitivity = {TITLE_NOT_SENSITIVE}
-        ORDER BY score DESC;
-        """
+        select
+            article_id, title, link, llm_sensitivity, score, category_by_ai
+        from crawler_meta_article t1
+        join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
+        where category = '{category}'
+            and platform = '{article_source}'
+            and title_sensitivity = {TITLE_NOT_SENSITIVE}
+            and t1.status = {self.INIT_STATUS}
+            and t1.read_cnt / t2.read_avg >= {self.READ_TIMES_THRESHOLD}
+            and t1.read_cnt >= {self.READ_THRESHOLD}
+        ORDER BY score DESC;
+        """
         article_list = self.db_client.select(sql)
         log(
             task="category_publish_task",
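Review note: the rewritten query pushes the status, read-average, and raw read-count filters into SQL — the join against `crawler_meta_article_accounts_read_avg` on `(gh_id, position)` replaces the pandas `groupby(['gh_id', 'position'])['read_cnt'].transform('mean')` that this diff deletes further down. It still interpolates `category` and `article_source` via f-string, though. A minimal parameterized sketch, assuming `db_client.select()` can accept bind parameters the way `db_client.update()` demonstrably does:

```python
# Sketch only: assumes db_client.select() takes a params tuple and forwards
# it to a pymysql-style cursor.execute(sql, params); this diff only shows
# the update() path accepting params.
TITLE_NOT_SENSITIVE = 0  # mirrors the module-level constant

CANDIDATE_SQL = """
    select article_id, title, link, llm_sensitivity, score, category_by_ai
    from crawler_meta_article t1
    join crawler_meta_article_accounts_read_avg t2
        on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
    where category = %s
      and platform = %s
      and title_sensitivity = %s
      and t1.status = %s
      and t1.read_cnt / t2.read_avg >= %s
      and t1.read_cnt >= %s
    order by score desc;
"""

def fetch_candidates(db_client, category, article_source,
                     init_status, read_times_threshold, read_threshold):
    # Threshold arguments stand in for self.INIT_STATUS,
    # self.READ_TIMES_THRESHOLD and self.READ_THRESHOLD on the class.
    return db_client.select(CANDIDATE_SQL, params=(
        category, article_source, TITLE_NOT_SENSITIVE,
        init_status, read_times_threshold, read_threshold))
```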
@@ -105,8 +109,7 @@ class CategoryColdStartTask(object):
             }
         )
         article_df = DataFrame(article_list,
-                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
-                                        'llm_sensitivity', 'score', 'category_by_ai'])
+                               columns=['article_id', 'title', 'link', 'llm_sensitivity', 'score', 'category_by_ai'])
         return article_df

     def filter_each_category(self, category):
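The hard-coded `columns=` list has to stay in lock-step with the SELECT list (this diff had to edit both sides at once). If `db_client` ever exposes its cursor, the names can be derived from DB-API metadata instead — a sketch, with `db_client.cursor` as a hypothetical attribute:

```python
from pandas import DataFrame

# Sketch: derive DataFrame columns from cursor metadata instead of a
# hand-maintained list. Assumes db_client exposes a DB-API cursor
# ("db_client.cursor" is hypothetical); description[i][0] is the column name.
cursor = db_client.cursor
cursor.execute(sql)
article_df = DataFrame(cursor.fetchall(),
                       columns=[col[0] for col in cursor.description])
```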
@@ -120,12 +123,9 @@ class CategoryColdStartTask(object):
         if title_list:
             # update
             update_sql = f"""
-            UPDATE
-                crawler_meta_article
-            SET
-                status = %s
-            WHERE
-                title in %s and status = %s;
+            update crawler_meta_article
+            set status = %s
+            where title in %s and status = %s;
             """
             affected_rows = self.db_client.update(
                 sql=update_sql,
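The `if title_list:` guard above this UPDATE is load-bearing: pymysql-style drivers expand a tuple bound to `%s` into a parenthesized, escaped value list, so an empty tuple would render as `in ()` and fail with a syntax error. The same pattern as a standalone sketch with the guard made explicit:

```python
# Sketch of the IN-clause update with an explicit empty-input guard.
# pymysql renders a tuple bound to %s as ('a', 'b', ...); an empty tuple
# would produce "in ()", which MySQL rejects.
def mark_titles_published(db_client, title_list, published_status, init_status):
    if not title_list:
        return 0  # nothing survived the funnel; skip the UPDATE
    update_sql = """
        update crawler_meta_article
        set status = %s
        where title in %s and status = %s;
    """
    return db_client.update(
        sql=update_sql,
        params=(published_status, tuple(title_list), init_status),
    )
```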
@@ -164,55 +164,36 @@ class CategoryColdStartTask(object):
         :return:
         """
         update_sql = f"""
-        UPDATE
-            crawler_meta_article
-        SET
-            status = %s
-        WHERE
-            article_id in %s and status = %s;
-        """
+        update crawler_meta_article
+        set status = %s
+        where article_id in %s and status = %s;
+        """
         affect_rows = self.db_client.update(
             sql=update_sql,
             params=(self.PUBLISHED_STATUS, tuple(article_id_list), self.INIT_STATUS)
         )
-        if affect_rows != len(article_id_list):
-            bot(
-                title="品类冷启任务中,出现更新状文章状态失败异常",
-                detail={
-                    "affected_rows": affect_rows,
-                    "task_rows": len(article_id_list)
-                }
-            )
+        # if affect_rows != len(article_id_list):
+        #     bot(
+        #         title="品类冷启任务中,出现更新状文章状态失败异常",
+        #         detail={
+        #             "affected_rows": affect_rows,
+        #             "task_rows": len(article_id_list)
+        #         }
+        #     )

     def filter_weixin_articles(self, articles_df, category):
         """
         微信抓取文章过滤漏斗
         """
-        articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
-        articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         total_length = articles_df.shape[0]
-        # 第0层过滤已经发布的文章
-        filter_df = articles_df[articles_df['status'] == self.INIT_STATUS]
-        length_level0 = filter_df.shape[0]
-
-        # 第一层漏斗通过阅读均值倍数过滤
-        filter_df = filter_df[filter_df['read_times'] >= self.READ_TIMES_THRESHOLD]
-        length_level1 = filter_df.shape[0]
-
-        # 第二层漏斗通过阅读量过滤
-        filter_df = filter_df[
-            filter_df['read_cnt'] >= self.READ_THRESHOLD
-        ]
-        length_level2 = filter_df.shape[0]
-
-        # 第三层漏斗通过标题长度过滤
-        filter_df = filter_df[
-            (filter_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
-            & (filter_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
+        # 第1层漏斗通过标题长度过滤
+        filter_df = articles_df[
+            (articles_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
+            & (articles_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
         ]
-        length_level3 = filter_df.shape[0]
+        length_level1 = filter_df.shape[0]

-        # 第四层通过敏感词过滤
+        # 第2层通过敏感词过滤
         filter_df = filter_df[
             (~filter_df['title'].str.contains('农历'))
             & (~filter_df['title'].str.contains('太极'))
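Review note on the commented-out consistency check: with it gone, an UPDATE that matches fewer rows than the batch size fails silently. If the `bot()` page was too noisy, one option is to keep the check but route it through the `log()` helper already imported at the top of this file — a drop-in sketch for the commented block (the `function` label is hypothetical, since the hunk header hides the real method name):

```python
# Sketch: downgrade the row-count mismatch from a bot() page to a log()
# entry instead of deleting the check. The keyword shape mirrors the other
# log() call sites in this diff; the function label is a stand-in.
if affect_rows != len(article_id_list):
    log(
        task="category_publish_task",
        function="change_article_status",  # hypothetical label
        message="UPDATE matched fewer articles than requested",
        data={
            "affected_rows": affect_rows,
            "task_rows": len(article_id_list),
        },
    )
```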
@@ -227,23 +208,23 @@ class CategoryColdStartTask(object):
             & (~filter_df['title'].str.contains('蔡英文'))
             & (~filter_df['title'].str.contains('中国'))
         ]
-        length_level4 = filter_df.shape[0]
-        # 第五层通过LLM敏感度过滤
+        length_level2 = filter_df.shape[0]
+        # 第3层通过LLM敏感度过滤
         filter_df = filter_df[
             ~(filter_df['llm_sensitivity'] > 0)
         ]
-        length_level5 = filter_df.shape[0]
+        length_level3 = filter_df.shape[0]

-        # 第六层通过相关性分数过滤
+        # 第4层通过相关性分数过滤
         filter_df = filter_df[filter_df['score'] > SIMILARITY_MIN_SCORE]
-        length_level6 = filter_df.shape[0]
+        length_level4 = filter_df.shape[0]

         log(
             task="category_publish_task",
             function="publish_filter_articles",
             message="过滤后文章总数",
             data={
-                "total_articles": length_level5,
+                "total_articles": length_level4,
                 "category": category
             }
         )
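The hand-numbered `length_levelN` counters have drifted before — the pre-diff code logged `length_level5` as the post-funnel total even though its last stage produced `length_level6` — and this diff renumbers every stage by hand again. A data-driven funnel derives the per-stage counts from one stage list, so adding or removing a stage can never leave a stale index; a sketch over the four remaining stages (constants are stand-ins for the class attributes, and the sensitive-word list is abbreviated):

```python
import pandas as pd

# Stand-ins for self.LIMIT_TITLE_LENGTH, self.TITLE_LENGTH_MAX and the
# module-level SIMILARITY_MIN_SCORE; the values here are illustrative only.
LIMIT_TITLE_LENGTH, TITLE_LENGTH_MAX, SIMILARITY_MIN_SCORE = 8, 30, 0.4
BANNED = '农历|太极|蔡英文|中国'  # abbreviated; the real chain filters more terms

# str.contains with a '|' pattern is regex alternation, equivalent to the
# chained ~contains() expressions in the diff.
STAGES = [
    ("通过标题长度过滤", lambda df: (df['title'].str.len() >= LIMIT_TITLE_LENGTH)
                                    & (df['title'].str.len() <= TITLE_LENGTH_MAX)),
    ("通过敏感词过滤", lambda df: ~df['title'].str.contains(BANNED)),
    ("通过LLM敏感度过滤", lambda df: ~(df['llm_sensitivity'] > 0)),
    ("通过相关性分数过滤", lambda df: df['score'] > SIMILARITY_MIN_SCORE),
]

def run_funnel(articles_df: pd.DataFrame):
    """Apply each stage in order; return survivors plus a per-stage report."""
    report, filter_df = {}, articles_df
    for label, predicate in STAGES:
        before = filter_df.shape[0]
        filter_df = filter_df[predicate(filter_df)]
        report[label] = "过滤数量: {} 剩余数量: {}".format(
            before - filter_df.shape[0], filter_df.shape[0])
    return filter_df, report
```

The `report` dict could then be merged straight into the `bot(detail=...)` payload in the hunk below, so the notification and the filtering can no longer disagree about stage numbering.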
@@ -251,21 +232,15 @@ class CategoryColdStartTask(object):
             title="冷启任务发布通知",
             detail={
                 "总文章数量": total_length,
-                "通过已经发布状态过滤": "过滤数量: {} 剩余数量: {}".format(
-                    total_length - length_level0, length_level0),
-                "通过阅读均值倍数过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level0 - length_level1, length_level1),
-                "通过阅读量过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level1 - length_level2, length_level2),
                 "通过标题长度过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level2 - length_level3, length_level3),
+                    total_length - length_level1, length_level1),
                 "通过敏感词过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level3 - length_level4, length_level4),
+                    length_level1 - length_level2, length_level2),
                 "通过LLM敏感度过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level4 - length_level5, length_level5
+                    length_level2 - length_level3, length_level3
                 ),
                 "通过相关性分数过滤": "过滤数量: {} 剩余数量: {}".format(
-                    length_level5 - length_level6, length_level6
+                    length_level3 - length_level4, length_level4
                 ),
                 "品类": category,
                 "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,