|
@@ -6,9 +6,10 @@ import traceback
|
|
|
from typing import List, Set, Dict, Tuple
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
+from datetime import datetime
|
|
|
from pymysql.cursors import DictCursor
|
|
|
|
|
|
-from applications import WeixinSpider, longArticlesMySQL, log, bot
|
|
|
+from applications import WeixinSpider, longArticlesMySQL, log, bot, aiditApi
|
|
|
from applications.const import WeixinVideoCrawlerConst
|
|
|
from applications.functions import Functions
|
|
|
|
|
@@ -16,6 +17,16 @@ const = WeixinVideoCrawlerConst()
|
|
|
function = Functions()
|
|
|
|
|
|
|
|
|
+def get_inner_account_gh_id() -> Set[str]:
|
|
|
+ """
|
|
|
+    获取内部账号的 gh_id 集合
|
|
|
+    :return: gh_id 集合
|
|
|
+ """
|
|
|
+ accounts = aiditApi.get_publish_account_from_aigc()
|
|
|
+ gh_id_list = [i['ghId'] for i in accounts]
|
|
|
+ return set(gh_id_list)
|
|
|
+
|
|
|
+
|
|
|
class WeixinAccountCrawler(object):
|
|
|
"""
|
|
|
账号抓取
|
|
@@ -26,18 +37,6 @@ class WeixinAccountCrawler(object):
|
|
|
self.spider = WeixinSpider()
|
|
|
self.crawler_account_count = 0
|
|
|
|
|
|
- def get_inner_account_name(self) -> Set[str]:
|
|
|
- """
|
|
|
- 获取内部账号名称
|
|
|
- :return:
|
|
|
- """
|
|
|
- sql = "select distinct account_name from datastat_sort_strategy;"
|
|
|
- account_name_list = self.db_client.select(sql, cursor_type=DictCursor)
|
|
|
- account_name_set = set()
|
|
|
- for account_name_obj in account_name_list:
|
|
|
- account_name_set.add(account_name_obj['account_name'])
|
|
|
- return account_name_set
|
|
|
-
|
|
|
def get_crawler_articles(self) -> List[Dict]:
|
|
|
"""
|
|
|
获取已经抓取到的文章,判断其是否有链接账号,若有则继续抓账号
|
|
@@ -64,11 +63,11 @@ class WeixinAccountCrawler(object):
|
|
|
affected_rows = self.db_client.update(sql, (const.DO_NOT_NEED_SOURCE_ACCOUNT, article_id_tuple))
|
|
|
return affected_rows
|
|
|
|
|
|
- def get_seed_titles(self) -> List[str]:
|
|
|
+ def get_seed_titles(self, run_date) -> List[str]:
|
|
|
"""
|
|
|
:return:
|
|
|
"""
|
|
|
- publish_timestamp_threshold = int(time.time()) - const.STAT_PERIOD
|
|
|
+ publish_timestamp_threshold = int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.STAT_PERIOD
|
|
|
sql = f"""
|
|
|
SELECT distinct title
|
|
|
FROM datastat_sort_strategy
|
|
@@ -125,7 +124,7 @@ class WeixinAccountCrawler(object):
|
|
|
account_detail = account_detail['data']['data']
|
|
|
account_name = account_detail['account_name']
|
|
|
gh_id = account_detail['wx_gh']
|
|
|
- if account_name in inner_account_set:
|
|
|
+ if gh_id in inner_account_set:
|
|
|
continue
|
|
|
# 判断搜索结果是否原创
|
|
|
if self.is_original(article_url):
|
|
@@ -177,19 +176,22 @@ class WeixinAccountCrawler(object):
|
|
|
self.process_search_result(response, inner_account_set)
|
|
|
time.sleep(const.SLEEP_SECONDS)
|
|
|
|
|
|
- def run(self) -> None:
|
|
|
+ def run(self, run_date=None) -> None:
|
|
|
"""
|
|
|
入口函数
|
|
|
:return:
|
|
|
"""
|
|
|
+ if not run_date:
|
|
|
+ run_date = time.strftime("%Y-%m-%d", time.localtime())
|
|
|
+
|
|
|
# get seed titles
|
|
|
- title_list = self.get_seed_titles()
|
|
|
+ title_list = self.get_seed_titles(run_date)
|
|
|
# get inner accounts set
|
|
|
- inner_account_set = self.get_inner_account_name()
|
|
|
+ inner_account_gh_id_set = get_inner_account_gh_id()
|
|
|
|
|
|
start_time = time.time()
|
|
|
for title in tqdm(title_list, desc="search each title"):
|
|
|
- self.search_title_in_weixin(title, inner_account_set)
|
|
|
+ self.search_title_in_weixin(title, inner_account_gh_id_set)
|
|
|
|
|
|
# 通知
|
|
|
bot(
|
|
@@ -249,6 +251,4 @@ class WeixinAccountCrawler(object):
|
|
|
"新增账号数量": insert_account_count
|
|
|
},
|
|
|
mention=False
|
|
|
- )
|
|
|
-
|
|
|
-
|
|
|
+ )
|