|
@@ -8,6 +8,7 @@ import traceback
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
from pymysql.cursors import DictCursor
|
|
|
+from pandas import DataFrame
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
from applications import log
|
|
@@ -419,11 +420,69 @@ class ArticlePoolCategoryGenerationTask(CategoryGenerationTask):
|
|
|
)
|
|
|
return fetch_result
|
|
|
|
|
|
def get_task_v2(self):
    """Select candidate articles by running them through a filter funnel.

    Fetches every row of ``crawler_meta_article`` whose category is
    ``account_association``, whose ``title_sensitivity`` is 0 and whose
    platform is ``weixin`` (ordered by score, descending), then keeps only
    the rows that pass all of the following layers, applied in order:

    0. ``status == 1``;
    1. ``read_cnt`` is at least 1.3 times the mean ``read_cnt`` of the row's
       (account, position) group -- the mean is taken over *all* fetched
       rows, i.e. before the status filter is applied;
    2. ``read_cnt >= 5000``;
    3. the title is 15 to 50 characters long (inclusive);
    4. the title contains none of the blocked words;
    5. ``score > 0.4``.

    Returns:
        list[dict]: one ``{"article_id": ..., "title": ...}`` dict per
        surviving row, in the order the rows were fetched. An empty list is
        returned when the query yields nothing.
    """
    # Local import so this block does not depend on the module's import list.
    import re

    # Funnel thresholds, named so they are not buried in the expressions below.
    min_read_times = 1.3
    min_read_cnt = 5000
    min_title_length, max_title_length = 15, 50
    min_score = 0.4
    sensitive_words = (
        '农历', '太极', '节', '早上好', '赖清德', '普京',
        '俄', '南海', '台海', '解放军', '蔡英文', '中国',
    )

    # Plain string: the query has no placeholders, so no f-string is needed.
    fetch_query = """
        select
            article_id, out_account_id, article_index, title, read_cnt, status, score
        from
            crawler_meta_article
        where
            category = 'account_association' and title_sensitivity = 0 and platform = 'weixin'
        order by score desc
    """
    article_list = self.db_client.fetch(query=fetch_query)
    if not article_list:
        # Nothing fetched: skip the pandas pipeline entirely.
        return []

    # NOTE(review): the column names are applied positionally, which assumes
    # fetch() returns sequence rows (not dicts) in select order -- confirm.
    articles_df = DataFrame(
        article_list,
        columns=['article_id', 'gh_id', 'position', 'title', 'read_cnt', 'status', 'score'],
    )

    # Read count of each article relative to the mean of its (account, position) group.
    articles_df['average_read'] = (
        articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
    )
    articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']

    # Layer 0: keep only rows whose status is 1.
    # NOTE(review): the original comment described this as filtering
    # already-published articles -- confirm what status 1 stands for.
    filter_df = articles_df[articles_df['status'] == 1]

    # Layer 1: read count relative to the group mean.
    filter_df = filter_df[filter_df['read_times'] >= min_read_times]

    # Layer 2: absolute read count.
    filter_df = filter_df[filter_df['read_cnt'] >= min_read_cnt]

    # Layer 3: title length, inclusive on both ends (missing titles are dropped).
    title_length = filter_df['title'].str.len()
    filter_df = filter_df[title_length.between(min_title_length, max_title_length)]

    # Layer 4: blocked words, matched literally in a single regex pass.
    sensitive_pattern = '|'.join(re.escape(word) for word in sensitive_words)
    filter_df = filter_df[~filter_df['title'].str.contains(sensitive_pattern)]

    # Layer 5: relevance score.
    filter_df = filter_df[filter_df['score'] > min_score]

    return filter_df[['article_id', 'title']].to_dict(orient='records')
|
|
|
+
|
|
|
def deal(self):
|
|
|
|
|
|
self.rollback_lock_tasks(self.const.ARTICLE_TABLE_NAME)
|
|
|
|
|
|
- task_list = self.get_task_list()
|
|
|
+ task_list = self.get_task_v2()
|
|
|
task_batch_list = yield_batch(data=task_list, batch_size=self.const.BATCH_SIZE)
|
|
|
|
|
|
# # dev
|