|  | @@ -74,7 +74,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |                  }
 | 
	
		
			
				|  |  |              )
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def get_articles_from_meta_table(self, category):
 | 
	
		
			
				|  |  | +    def get_articles_from_meta_table(self, category, article_source):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          从长文 meta 库中获取冷启文章
 | 
	
		
			
				|  |  |          :return:
 | 
	
	
		
			
				|  | @@ -85,7 +85,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          FROM
 | 
	
		
			
				|  |  |              crawler_meta_article
 | 
	
		
			
				|  |  |          WHERE 
 | 
	
		
			
				|  |  | -            category = "{category}";
 | 
	
		
			
				|  |  | +            category = "{category}" and platform = "{article_source}" and status = {self.INIT_STATUS};
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          article_list = self.db_client.select(sql)
 | 
	
		
			
				|  |  |          log(
 | 
	
	
		
			
				|  | @@ -153,12 +153,9 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |                  }
 | 
	
		
			
				|  |  |              )
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def publish_filter_articles(self, category, articles_df):
 | 
	
		
			
				|  |  | +    def filter_weixin_articles(self, articles_df, category):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  | -        过滤文章
 | 
	
		
			
				|  |  | -        :param category:
 | 
	
		
			
				|  |  | -        :param articles_df:
 | 
	
		
			
				|  |  | -        :return:
 | 
	
		
			
				|  |  | +        微信抓取文章过滤漏斗
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
 | 
	
		
			
				|  |  |          articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
 | 
	
	
		
			
				|  | @@ -174,13 +171,13 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          # 第二层漏斗通过阅读量过滤
 | 
	
		
			
				|  |  |          second_level_funnel_df = first_level_funnel_df[
 | 
	
		
			
				|  |  |              first_level_funnel_df['read_cnt'] >= self.READ_THRESHOLD
 | 
	
		
			
				|  |  | -        ]
 | 
	
		
			
				|  |  | +            ]
 | 
	
		
			
				|  |  |          second_level_funnel_length = second_level_funnel_df.shape[0]
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          # 第三层漏斗通过标题长度过滤
 | 
	
		
			
				|  |  |          third_level_funnel_df = second_level_funnel_df[
 | 
	
		
			
				|  |  |              second_level_funnel_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH
 | 
	
		
			
				|  |  | -        ]
 | 
	
		
			
				|  |  | +            ]
 | 
	
		
			
				|  |  |          third_level_funnel_length = third_level_funnel_df.shape[0]
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          # 最后一层通过敏感词过滤
 | 
	
	
		
			
				|  | @@ -199,7 +196,6 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |              & (~third_level_funnel_df['title'].str.contains('中国'))
 | 
	
		
			
				|  |  |              ]
 | 
	
		
			
				|  |  |          final_length = filter_df.shape[0]
 | 
	
		
			
				|  |  | -        url_list = filter_df['link'].values.tolist()
 | 
	
		
			
				|  |  |          log(
 | 
	
		
			
				|  |  |              task="category_publish_task",
 | 
	
		
			
				|  |  |              function="publish_filter_articles",
 | 
	
	
		
			
				|  | @@ -213,11 +209,16 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |              title="冷启任务发布通知",
 | 
	
		
			
				|  |  |              detail={
 | 
	
		
			
				|  |  |                  "总文章数量": total_length,
 | 
	
		
			
				|  |  | -                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(total_length - zero_level_funnel_length, zero_level_funnel_length),
 | 
	
		
			
				|  |  | -                "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
 | 
	
		
			
				|  |  | -                "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
 | 
	
		
			
				|  |  | -                "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
 | 
	
		
			
				|  |  | -                "通过敏感词过滤":  "过滤数量: {}    剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
 | 
	
		
			
				|  |  | +                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(total_length - zero_level_funnel_length,
 | 
	
		
			
				|  |  | +                                                                              zero_level_funnel_length),
 | 
	
		
			
				|  |  | +                "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(
 | 
	
		
			
				|  |  | +                    zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
 | 
	
		
			
				|  |  | +                "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(
 | 
	
		
			
				|  |  | +                    first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
 | 
	
		
			
				|  |  | +                "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(
 | 
	
		
			
				|  |  | +                    second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
 | 
	
		
			
				|  |  | +                "通过敏感词过滤": "过滤数量: {}    剩余数量: {}".format(third_level_funnel_length - final_length,
 | 
	
		
			
				|  |  | +                                                                        final_length),
 | 
	
		
			
				|  |  |                  "品类": category,
 | 
	
		
			
				|  |  |                  "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
 | 
	
		
			
				|  |  |                  "阅读量阈值": self.READ_THRESHOLD,
 | 
	
	
		
			
				|  | @@ -225,12 +226,34 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |              },
 | 
	
		
			
				|  |  |              mention=False
 | 
	
		
			
				|  |  |          )
 | 
	
		
			
				|  |  | +        return filter_df
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def publish_filter_articles(self, category, articles_df, article_source):
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        过滤文章
 | 
	
		
			
				|  |  | +        :param category: 文章品类
 | 
	
		
			
				|  |  | +        :param articles_df: 该品类下的文章data_frame
 | 
	
		
			
				|  |  | +        :param article_source: 文章来源
 | 
	
		
			
				|  |  | +        :return:
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        match article_source:
 | 
	
		
			
				|  |  | +            case "weixin":
 | 
	
		
			
				|  |  | +                filtered_articles_df = self.filter_weixin_articles(articles_df, category)
 | 
	
		
			
				|  |  | +                input_source_channel = 5
 | 
	
		
			
				|  |  | +            case "toutiao":
 | 
	
		
			
				|  |  | +                filtered_articles_df = articles_df
 | 
	
		
			
				|  |  | +                input_source_channel = 6
 | 
	
		
			
				|  |  | +            case _:
 | 
	
		
			
				|  |  | +                return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        url_list = filtered_articles_df['link'].values.tolist()
 | 
	
		
			
				|  |  |          if url_list:
 | 
	
		
			
				|  |  |              # create_crawler_plan
 | 
	
		
			
				|  |  |              crawler_plan_response = aiditApi.auto_create_crawler_task(
 | 
	
		
			
				|  |  |                  plan_id=None,
 | 
	
		
			
				|  |  |                  plan_name="自动绑定-{}--{}--{}".format(category, datetime.date.today().__str__(), len(url_list)),
 | 
	
		
			
				|  |  |                  plan_tag="品类冷启动",
 | 
	
		
			
				|  |  | +                article_source=article_source,
 | 
	
		
			
				|  |  |                  url_list=url_list
 | 
	
		
			
				|  |  |              )
 | 
	
		
			
				|  |  |              log(
 | 
	
	
		
			
				|  | @@ -239,7 +262,6 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |                  message="成功创建抓取计划",
 | 
	
		
			
				|  |  |                  data=crawler_plan_response
 | 
	
		
			
				|  |  |              )
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  |              # save to db
 | 
	
		
			
				|  |  |              create_timestamp = int(time.time()) * 1000
 | 
	
		
			
				|  |  |              crawler_plan_id = crawler_plan_response['data']['id']
 | 
	
	
		
			
				|  | @@ -253,10 +275,10 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |                      "inputSourceType": 2,
 | 
	
		
			
				|  |  |                      "inputSourceSubType": None,
 | 
	
		
			
				|  |  |                      "fieldName": None,
 | 
	
		
			
				|  |  | -                    "inputSourceValue": crawler_plan_response['data']['id'],
 | 
	
		
			
				|  |  | -                    "inputSourceLabel": crawler_plan_response['data']['name'],
 | 
	
		
			
				|  |  | +                    "inputSourceValue": crawler_plan_id,
 | 
	
		
			
				|  |  | +                    "inputSourceLabel": crawler_plan_name,
 | 
	
		
			
				|  |  |                      "inputSourceModal": 3,
 | 
	
		
			
				|  |  | -                    "inputSourceChannel": 5
 | 
	
		
			
				|  |  | +                    "inputSourceChannel": input_source_channel
 | 
	
		
			
				|  |  |                  }
 | 
	
		
			
				|  |  |              ]
 | 
	
		
			
				|  |  |              generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
 | 
	
	
		
			
				|  | @@ -271,16 +293,17 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |              )
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |              # change article status
 | 
	
		
			
				|  |  | -            article_id_list = filter_df['article_id'].values.tolist()
 | 
	
		
			
				|  |  | +            article_id_list = articles_df['article_id'].values.tolist()
 | 
	
		
			
				|  |  |              self.change_article_status_while_publishing(article_id_list=article_id_list)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def do_job(self, category_list=None):
 | 
	
		
			
				|  |  | +    def do_job(self, article_source, category_list=None):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          执行任务
 | 
	
		
			
				|  |  |          :return:
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          if not category_list:
 | 
	
		
			
				|  |  |              category_list = self.category_map.keys()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |          log(
 | 
	
		
			
				|  |  |              task="category_publish_task",
 | 
	
		
			
				|  |  |              function="do_job",
 | 
	
	
		
			
				|  | @@ -291,10 +314,11 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |          )
 | 
	
		
			
				|  |  |          for category in category_list:
 | 
	
		
			
				|  |  |              try:
 | 
	
		
			
				|  |  | -                category_df = self.get_articles_from_meta_table(category=category)
 | 
	
		
			
				|  |  | +                category_df = self.get_articles_from_meta_table(category=category, article_source=article_source)
 | 
	
		
			
				|  |  |                  self.publish_filter_articles(
 | 
	
		
			
				|  |  |                      category=category,
 | 
	
		
			
				|  |  | -                    articles_df=category_df
 | 
	
		
			
				|  |  | +                    articles_df=category_df,
 | 
	
		
			
				|  |  | +                    article_source=article_source
 | 
	
		
			
				|  |  |                  )
 | 
	
		
			
				|  |  |              except Exception as e:
 | 
	
		
			
				|  |  |                  bot(
 | 
	
	
		
			
				|  | @@ -302,6 +326,7 @@ class CategoryColdStartTask(object):
 | 
	
		
			
				|  |  |                      detail={
 | 
	
		
			
				|  |  |                          "category": category,
 | 
	
		
			
				|  |  |                          "error": str(e),
 | 
	
		
			
				|  |  | -                        "function": "do_job"
 | 
	
		
			
				|  |  | +                        "function": "do_job",
 | 
	
		
			
				|  |  | +                        "traceback": traceback.format_exc()
 | 
	
		
			
				|  |  |                      }
 | 
	
		
			
				|  |  | -                )
 | 
	
		
			
				|  |  | +                )
 |