|  | @@ -2,19 +2,46 @@
 | 
	
		
			
				|  |  |  @author: luojunhui
 | 
	
		
			
				|  |  |  抓取全局品类文章
 | 
	
		
			
				|  |  |  """
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | +import json
 | 
	
		
			
				|  |  |  import time
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  from tqdm import tqdm
 | 
	
		
			
				|  |  | +from pymysql.cursors import DictCursor
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  from applications import WeixinSpider, Functions, llm_sensitivity, log
 | 
	
		
			
				|  |  |  from coldStartTasks.filter import article_crawler_duplicate_filter
 | 
	
		
			
				|  |  | +from config import apolloConfig
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  # 常量
 | 
	
		
			
				|  |  |  ACCOUNT_GOOD_STATUS = 1
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# 账号是否每日抓取
 | 
	
		
			
				|  |  | +ACCOUNT_DAILY_SCRAPE = 1
 | 
	
		
			
				|  |  | +ACCOUNT_NOT_DAILY_SCRAPE = 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# 默认值
 | 
	
		
			
				|  |  |  DEFAULT_VIEW_COUNT = 0
 | 
	
		
			
				|  |  |  DEFAULT_LIKE_COUNT = 0
 | 
	
		
			
				|  |  |  DEFAULT_ARTICLE_STATUS = 1
 | 
	
		
			
				|  |  | -DEFAULT_TIMESTAMP = 1704038400
 | 
	
		
			
				|  |  | +DEFAULT_TIMESTAMP = 1717171200
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# 标题sensitivity
 | 
	
		
			
				|  |  | +TITLE_SENSITIVE = 1
 | 
	
		
			
				|  |  | +TITLE_NOT_SENSITIVE = 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +config = apolloConfig()
 | 
	
		
			
				|  |  | +sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def whether_title_sensitive(title: str) -> bool:
 | 
	
		
			
				|  |  | +    """
 | 
	
		
			
				|  |  | +    : param title:
 | 
	
		
			
				|  |  | +    判断视频是否的标题是否包含敏感词
 | 
	
		
			
				|  |  | +    """
 | 
	
		
			
				|  |  | +    for word in sensitive_word_list:
 | 
	
		
			
				|  |  | +        if word in title:
 | 
	
		
			
				|  |  | +            return True
 | 
	
		
			
				|  |  | +    return False
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  class weixinCategory(object):
 | 
	
	
		
			
				|  | @@ -36,7 +63,7 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |          sql = f"""
 | 
	
		
			
				|  |  |              select gh_id, account_source, account_name, account_category, latest_update_time
 | 
	
		
			
				|  |  |              from long_articles_accounts 
 | 
	
		
			
				|  |  | -            where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS};
 | 
	
		
			
				|  |  | +            where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS} and daily_scrape = {ACCOUNT_DAILY_SCRAPE};
 | 
	
		
			
				|  |  |              """
 | 
	
		
			
				|  |  |          account_tuple = self.db_client_lam.select(sql)
 | 
	
		
			
				|  |  |          result = [
 | 
	
	
		
			
				|  | @@ -51,10 +78,25 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |          ]
 | 
	
		
			
				|  |  |          return result
 | 
	
		
			
				|  |  |  
 | 
	
		
			
    def get_association_account_list(self, date_str):
        """
        Select the account-association accounts to poll for the given day.

        Accounts of category ``account_association`` that are in use but not
        crawled daily are split into groups by the last character of their
        ``account_id``; only the group matching the last character of
        ``date_str`` is returned.

        :param date_str: date string; only its last character is used as the
            group id.
        :return: list of account rows whose ``account_id`` ends with that
            character.
        """
        group_id = date_str[-1]
        # ``is_using`` is filtered with ACCOUNT_GOOD_STATUS, the same constant
        # get_account_list uses for that column, rather than the daily-scrape flag.
        sql = f"""
            select account_id, gh_id, account_name, latest_update_time
            from long_articles_accounts
            where account_category = 'account_association' and is_using = {ACCOUNT_GOOD_STATUS} and daily_scrape = {ACCOUNT_NOT_DAILY_SCRAPE};
        """
        account_list = self.db_client_lam.select(sql, cursor_type=DictCursor)
        today_crawler_account_list = [
            account for account in account_list
            if str(account['account_id'])[-1] == group_id
        ]
        return today_crawler_account_list
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      def insert_data_into_db(self, gh_id, category, article_list):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          将数据更新到数据库
 | 
	
		
			
				|  |  |          :return:
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          success_records = []
 | 
	
		
			
				|  |  |          for article_obj in article_list:
 | 
	
	
		
			
				|  | @@ -63,7 +105,7 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                  try:
 | 
	
		
			
				|  |  |                      # 判断文章是否存在相同的标题
 | 
	
		
			
				|  |  |                      if article_crawler_duplicate_filter(
 | 
	
		
			
				|  |  | -                        new_article_title=obj["Title"], db_client=self.db_client_lam
 | 
	
		
			
				|  |  | +                            new_article_title=obj["Title"], db_client=self.db_client_lam
 | 
	
		
			
				|  |  |                      ):
 | 
	
		
			
				|  |  |                          log(
 | 
	
		
			
				|  |  |                              function="weixinCategory",
 | 
	
	
		
			
				|  | @@ -72,6 +114,9 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                              data={"title": obj["Title"]}
 | 
	
		
			
				|  |  |                          )
 | 
	
		
			
				|  |  |                          continue
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                    # 判断标题是否包含敏感词
 | 
	
		
			
				|  |  | +                    title_sensitivity = TITLE_SENSITIVE if whether_title_sensitive(obj["Title"]) else TITLE_NOT_SENSITIVE
 | 
	
		
			
				|  |  |                      show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
 | 
	
		
			
				|  |  |                      show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
 | 
	
		
			
				|  |  |                      show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
 | 
	
	
		
			
				|  | @@ -80,10 +125,10 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                          insert into crawler_meta_article
 | 
	
		
			
				|  |  |                          (
 | 
	
		
			
				|  |  |                           platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
 | 
	
		
			
				|  |  | -                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity
 | 
	
		
			
				|  |  | +                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
 | 
	
		
			
				|  |  |                          )
 | 
	
		
			
				|  |  |                          VALUES 
 | 
	
		
			
				|  |  | -                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
 | 
	
		
			
				|  |  | +                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
 | 
	
		
			
				|  |  |                      """
 | 
	
		
			
				|  |  |                      self.db_client_lam.update(
 | 
	
		
			
				|  |  |                          sql=insert_sql,
 | 
	
	
		
			
				|  | @@ -102,7 +147,8 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                              int(time.time()),
 | 
	
		
			
				|  |  |                              DEFAULT_ARTICLE_STATUS,
 | 
	
		
			
				|  |  |                              unique_idx,
 | 
	
		
			
				|  |  | -                            obj.get("llm_sensitivity", -1)
 | 
	
		
			
				|  |  | +                            obj.get("llm_sensitivity", -1),
 | 
	
		
			
				|  |  | +                            title_sensitivity
 | 
	
		
			
				|  |  |                          ),
 | 
	
		
			
				|  |  |                      )
 | 
	
		
			
				|  |  |                      success_records.append({
 | 
	
	
		
			
				|  | @@ -175,48 +221,59 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |              print("No more data")
 | 
	
		
			
				|  |  |              return []
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def deal(self, category_list):
 | 
	
		
			
    def crawler_each_category(self, account_list, category):
        """
        Crawl every account of one category, then record LLM title sensitivity.

        :param account_list: iterable of account dicts. Each must provide
            ``gh_id``; the last update time is read from ``latest_timestamp``
            or, when that is missing, from ``latest_update_time``.
        :param category: category label passed to ``update_each_account`` and
            ``update_article_sensitive_status``.
        :return: None
        """
        success_records = []
        for account in tqdm(account_list, desc="crawler_each_category"):
            try:
                gh_id = account['gh_id']
                try:
                    # Rows returned by get_association_account_list carry the
                    # column name ``latest_update_time`` instead of
                    # ``latest_timestamp``; accept either so those accounts do
                    # not always fall back to DEFAULT_TIMESTAMP.
                    latest_time = account.get('latest_timestamp') or account.get('latest_update_time')
                    timestamp = int(latest_time.timestamp())
                except Exception:
                    # Missing or non-datetime value: start from the default.
                    timestamp = DEFAULT_TIMESTAMP
                success_records += self.update_each_account(
                    gh_id=gh_id,
                    category=category,
                    latest_time_stamp=timestamp
                )
                print("success")
            except Exception as e:
                # Best effort: one failing account must not stop the others.
                print("fail because of {}".format(e))

        success_titles = [x['title'] for x in success_records]
        if not success_titles:
            return

        try:
            sensitive_results = llm_sensitivity.check_titles(success_titles)
            # Results are paired with records by position.
            for record, sensitive_result in zip(success_records, sensitive_results):
                self.update_article_sensitive_status(
                    category=category,
                    unique_index=record['unique_index'],
                    status=sensitive_result['hit_rule']
                )
        except Exception as e:
            print("failed to update sensitive status: {}".format(e))
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +    def deal(self, category_list, date_str):
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  |          :param category_list:
 | 
	
		
			
				|  |  | +        :param date_str: YYYY-MM-DD
 | 
	
		
			
				|  |  |          :return:
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  | +        # daily 品类账号抓取
 | 
	
		
			
				|  |  |          for category in category_list:
 | 
	
		
			
				|  |  | -            success_records = []
 | 
	
		
			
				|  |  |              account_list = self.get_account_list(category)
 | 
	
		
			
				|  |  | -            for account in tqdm(account_list):
 | 
	
		
			
				|  |  | -                try:
 | 
	
		
			
				|  |  | -                    gh_id = account['gh_id']
 | 
	
		
			
				|  |  | -                    category = account['category']
 | 
	
		
			
				|  |  | -                    try:
 | 
	
		
			
				|  |  | -                        timestamp = int(account['latest_timestamp'].timestamp())
 | 
	
		
			
				|  |  | -                    except Exception as e:
 | 
	
		
			
				|  |  | -                        timestamp = DEFAULT_TIMESTAMP
 | 
	
		
			
				|  |  | -                    success_records += self.update_each_account(
 | 
	
		
			
				|  |  | -                        gh_id=gh_id,
 | 
	
		
			
				|  |  | -                        category=category,
 | 
	
		
			
				|  |  | -                        latest_time_stamp=timestamp
 | 
	
		
			
				|  |  | -                    )
 | 
	
		
			
				|  |  | -                    print("success")
 | 
	
		
			
				|  |  | -                except Exception as e:
 | 
	
		
			
				|  |  | -                    print("fail because of {}".format(e))
 | 
	
		
			
				|  |  | -            success_titles = [x['title'] for x in success_records]
 | 
	
		
			
				|  |  | -            if success_titles:
 | 
	
		
			
				|  |  | -                try:
 | 
	
		
			
				|  |  | -                    sensitive_results = llm_sensitivity.check_titles(success_titles)
 | 
	
		
			
				|  |  | -                    for record, sensitive_result in zip(success_records, sensitive_results):
 | 
	
		
			
				|  |  | -                        self.update_article_sensitive_status(
 | 
	
		
			
				|  |  | -                            category=category,
 | 
	
		
			
				|  |  | -                            unique_index=record['unique_index'],
 | 
	
		
			
				|  |  | -                            status=sensitive_result['hit_rule']
 | 
	
		
			
				|  |  | -                        )
 | 
	
		
			
				|  |  | -                except Exception as e:
 | 
	
		
			
				|  |  | -                    print("failed to update sensitive status: {}".format(e))
 | 
	
		
			
				|  |  | +            self.crawler_each_category(account_list=account_list, category=category)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        # 账号联想账号轮询抓取
 | 
	
		
			
				|  |  | +        association_account_list = self.get_association_account_list(date_str)
 | 
	
		
			
				|  |  | +        self.crawler_each_category(account_list=association_account_list, category="association")
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def deal_accounts(self, account_list):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          input account list
 | 
	
		
			
				|  |  | -        :param account_list:
 | 
	
		
			
				|  |  | +        :param account_list: 具体账号抓取,只抓一页
 | 
	
		
			
				|  |  |          :return:
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          account_tuple = tuple(account_list)
 | 
	
	
		
			
				|  | @@ -233,6 +290,7 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                  try:
 | 
	
		
			
				|  |  |                      latest_timestamp = account[3].timestamp()
 | 
	
		
			
				|  |  |                  except Exception as e:
 | 
	
		
			
				|  |  | +                    print(e)
 | 
	
		
			
				|  |  |                      latest_timestamp = DEFAULT_TIMESTAMP
 | 
	
		
			
				|  |  |                  self.update_each_account(
 | 
	
		
			
				|  |  |                      gh_id=gh_id,
 | 
	
	
		
			
				|  | @@ -241,5 +299,3 @@ class weixinCategory(object):
 | 
	
		
			
				|  |  |                  )
 | 
	
		
			
				|  |  |              except Exception as e:
 | 
	
		
			
				|  |  |                  print(e)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 |