Ver Fonte

冷启动发布,增加品类信息bugfix

luojunhui há 2 meses atrás
pai
commit
34006423d3
1 ficheiros alterados com 60 adições e 1 exclusões
  1. 60 1
      tasks/ai_tasks/category_generation_task.py

+ 60 - 1
tasks/ai_tasks/category_generation_task.py

@@ -8,6 +8,7 @@ import traceback
 from concurrent.futures import ThreadPoolExecutor
 
 from pymysql.cursors import DictCursor
+from pandas import DataFrame
 from tqdm import tqdm
 
 from applications import log
@@ -419,11 +420,69 @@ class ArticlePoolCategoryGenerationTask(CategoryGenerationTask):
         )
         return fetch_result
 
+    def get_task_v2(self):
+        fetch_query = f"""
+            select 
+                article_id, out_account_id, article_index, title, read_cnt, status, score
+            from
+                crawler_meta_article
+            where 
+                category = 'account_association' and title_sensitivity = 0 and platform = 'weixin'
+            order by score desc
+        """
+        article_list = self.db_client.fetch(query=fetch_query)
+        articles_df = DataFrame(
+            article_list,
+            columns=['article_id', 'gh_id', 'position', 'title', 'read_cnt', 'status','score']
+        )
+        # filter
+        articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
+        articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
+        # 第0层过滤已经发布的文章
+        filter_df = articles_df[articles_df['status'] == 1]
+
+        # 第一层漏斗通过阅读均值倍数过滤
+        filter_df = filter_df[filter_df['read_times'] >= 1.3]
+
+        # 第二层漏斗通过阅读量过滤
+        filter_df = filter_df[
+            filter_df['read_cnt'] >= 5000
+            ]
+
+        # 第三层漏斗通过标题长度过滤
+        filter_df = filter_df[
+            (filter_df['title'].str.len() >= 15)
+            & (filter_df['title'].str.len() <= 50)
+            ]
+
+        # 第四层通过敏感词过滤
+        filter_df = filter_df[
+            (~filter_df['title'].str.contains('农历'))
+            & (~filter_df['title'].str.contains('太极'))
+            & (~filter_df['title'].str.contains('节'))
+            & (~filter_df['title'].str.contains('早上好'))
+            & (~filter_df['title'].str.contains('赖清德'))
+            & (~filter_df['title'].str.contains('普京'))
+            & (~filter_df['title'].str.contains('俄'))
+            & (~filter_df['title'].str.contains('南海'))
+            & (~filter_df['title'].str.contains('台海'))
+            & (~filter_df['title'].str.contains('解放军'))
+            & (~filter_df['title'].str.contains('蔡英文'))
+            & (~filter_df['title'].str.contains('中国'))
+            ]
+        length_level4 = filter_df.shape[0]
+
+        # 第六层通过相关性分数过滤
+        filter_df = filter_df[filter_df['score'] > 0.4]
+
+        result = filter_df[['article_id', 'title']].to_dict(orient='records')
+        return result
+
     def deal(self):
 
         self.rollback_lock_tasks(self.const.ARTICLE_TABLE_NAME)
 
-        task_list = self.get_task_list()
+        task_list = self.get_task_v2()
         task_batch_list = yield_batch(data=task_list, batch_size=self.const.BATCH_SIZE)
 
         # #  dev