luojunhui 10 ay önce
ebeveyn
işleme
99023bf6bf
1 değiştirilmiş dosya ile 23 ekleme ve 23 silme
  1. 23 23
      coldStartTasks/crawler/weixin_account_crawler.py

+ 23 - 23
coldStartTasks/crawler/weixin_account_crawler.py

@@ -6,9 +6,10 @@ import traceback
 from typing import List, Set, Dict, Tuple
 
 from tqdm import tqdm
+from datetime import datetime
 from pymysql.cursors import DictCursor
 
-from applications import WeixinSpider, longArticlesMySQL, log, bot
+from applications import WeixinSpider, longArticlesMySQL, log, bot, aiditApi
 from applications.const import WeixinVideoCrawlerConst
 from applications.functions import Functions
 
@@ -16,6 +17,16 @@ const = WeixinVideoCrawlerConst()
 function = Functions()
 
 
+def get_inner_account_gh_id() -> Set[str]:
+    """
+    Collect the ghId of every publish account returned by aiditApi.
+    :return: set of the accounts' ghId values (duplicates collapsed)
+    """
+    publish_accounts = aiditApi.get_publish_account_from_aigc()
+    gh_ids = {account['ghId'] for account in publish_accounts}
+    return gh_ids
+
+
 class WeixinAccountCrawler(object):
     """
     账号抓取
@@ -26,18 +37,6 @@ class WeixinAccountCrawler(object):
         self.spider = WeixinSpider()
         self.crawler_account_count = 0
 
-    def get_inner_account_name(self) -> Set[str]:
-        """
-        获取内部账号名称
-        :return:
-        """
-        sql = "select distinct account_name from datastat_sort_strategy;"
-        account_name_list = self.db_client.select(sql, cursor_type=DictCursor)
-        account_name_set = set()
-        for account_name_obj in account_name_list:
-            account_name_set.add(account_name_obj['account_name'])
-        return account_name_set
-
     def get_crawler_articles(self) -> List[Dict]:
         """
         获取已经抓取到的文章,判断其是否有链接账号,若有则继续抓账号
@@ -64,11 +63,11 @@ class WeixinAccountCrawler(object):
         affected_rows = self.db_client.update(sql, (const.DO_NOT_NEED_SOURCE_ACCOUNT, article_id_tuple))
         return affected_rows
 
-    def get_seed_titles(self) -> List[str]:
+    def get_seed_titles(self, run_date) -> List[str]:
         """
         :return:
         """
-        publish_timestamp_threshold = int(time.time()) - const.STAT_PERIOD
+        publish_timestamp_threshold = int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.STAT_PERIOD
         sql = f"""
             SELECT distinct title
             FROM datastat_sort_strategy
@@ -125,7 +124,7 @@ class WeixinAccountCrawler(object):
                     account_detail = account_detail['data']['data']
                     account_name = account_detail['account_name']
                     gh_id = account_detail['wx_gh']
-                    if account_name in inner_account_set:
+                    if gh_id in inner_account_set:
                         continue
                     # 判断搜索结果是否原创
                     if self.is_original(article_url):
@@ -177,19 +176,22 @@ class WeixinAccountCrawler(object):
             self.process_search_result(response, inner_account_set)
             time.sleep(const.SLEEP_SECONDS)
 
-    def run(self) -> None:
+    def run(self, run_date=None) -> None:
         """
         入口函数
         :return:
         """
+        if not run_date:
+            run_date = time.strftime("%Y-%m-%d", time.localtime())
+
         # get seed titles
-        title_list = self.get_seed_titles()
+        title_list = self.get_seed_titles(run_date)
         # get inner accounts set
-        inner_account_set = self.get_inner_account_name()
+        inner_account_gh_id_set = get_inner_account_gh_id()
 
         start_time = time.time()
         for title in tqdm(title_list, desc="search each title"):
-            self.search_title_in_weixin(title, inner_account_set)
+            self.search_title_in_weixin(title, inner_account_gh_id_set)
 
         # 通知
         bot(
@@ -249,6 +251,4 @@ class WeixinAccountCrawler(object):
                 "新增账号数量": insert_account_count
             },
             mention=False
-        )
-
-
+        )