|
@@ -6,9 +6,10 @@ import traceback
|
|
|
from typing import List, Set, Dict, Tuple
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
+from datetime import datetime
|
|
|
from pymysql.cursors import DictCursor
|
|
|
|
|
|
-from applications import WeixinSpider, longArticlesMySQL, log, bot
|
|
|
+from applications import WeixinSpider, longArticlesMySQL, log, bot, aiditApi
|
|
|
from applications.const import WeixinVideoCrawlerConst
|
|
|
from applications.functions import Functions
|
|
|
|
|
@@ -16,6 +17,16 @@ const = WeixinVideoCrawlerConst()
|
|
|
function = Functions()
|
|
|
|
|
|
|
|
|
+def get_inner_account_gh_id() -> Set[str]:
|
|
|
+ """
|
|
|
+    获取内部账号的 gh_id 集合
|
|
|
+    :return: gh_id 集合
|
|
|
+ """
|
|
|
+ accounts = aiditApi.get_publish_account_from_aigc()
|
|
|
+ gh_id_list = [i['ghId'] for i in accounts]
|
|
|
+ return set(gh_id_list)
|
|
|
+
|
|
|
+
|
|
|
class WeixinAccountCrawler(object):
|
|
|
"""
|
|
|
账号抓取
|
|
@@ -26,18 +37,6 @@ class WeixinAccountCrawler(object):
|
|
|
self.spider = WeixinSpider()
|
|
|
self.crawler_account_count = 0
|
|
|
|
|
|
- def get_inner_account_name(self) -> Set[str]:
|
|
|
- """
|
|
|
- 获取内部账号名称
|
|
|
- :return:
|
|
|
- """
|
|
|
- sql = "select distinct account_name from datastat_sort_strategy;"
|
|
|
- account_name_list = self.db_client.select(sql, cursor_type=DictCursor)
|
|
|
- account_name_set = set()
|
|
|
- for account_name_obj in account_name_list:
|
|
|
- account_name_set.add(account_name_obj['account_name'])
|
|
|
- return account_name_set
|
|
|
-
|
|
|
def get_crawler_articles(self) -> List[Dict]:
|
|
|
"""
|
|
|
获取已经抓取到的文章,判断其是否有链接账号,若有则继续抓账号
|
|
@@ -64,11 +63,11 @@ class WeixinAccountCrawler(object):
|
|
|
affected_rows = self.db_client.update(sql, (const.DO_NOT_NEED_SOURCE_ACCOUNT, article_id_tuple))
|
|
|
return affected_rows
|
|
|
|
|
|
- def get_seed_titles(self) -> List[str]:
|
|
|
+ def get_seed_titles(self, run_date) -> List[str]:
|
|
|
"""
|
|
|
:return:
|
|
|
"""
|
|
|
- publish_timestamp_threshold = int(time.time()) - const.STAT_PERIOD
|
|
|
+ publish_timestamp_threshold = int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.STAT_PERIOD
|
|
|
sql = f"""
|
|
|
SELECT distinct title
|
|
|
FROM datastat_sort_strategy
|
|
@@ -125,7 +124,7 @@ class WeixinAccountCrawler(object):
|
|
|
account_detail = account_detail['data']['data']
|
|
|
account_name = account_detail['account_name']
|
|
|
gh_id = account_detail['wx_gh']
|
|
|
- if account_name in inner_account_set:
|
|
|
+ if gh_id in inner_account_set:
|
|
|
continue
|
|
|
# 判断搜索结果是否原创
|
|
|
if self.is_original(article_url):
|
|
@@ -177,19 +176,22 @@ class WeixinAccountCrawler(object):
|
|
|
self.process_search_result(response, inner_account_set)
|
|
|
time.sleep(const.SLEEP_SECONDS)
|
|
|
|
|
|
- def run(self) -> None:
|
|
|
+ def run(self, run_date=None) -> None:
|
|
|
"""
|
|
|
入口函数
|
|
|
:return:
|
|
|
"""
|
|
|
+ if not run_date:
|
|
|
+ run_date = time.strftime("%Y-%m-%d", time.localtime())
|
|
|
+
|
|
|
# get seed titles
|
|
|
- title_list = self.get_seed_titles()
|
|
|
+ title_list = self.get_seed_titles(run_date)
|
|
|
# get inner accounts set
|
|
|
- inner_account_set = self.get_inner_account_name()
|
|
|
+ inner_account_gh_id_set = get_inner_account_gh_id()
|
|
|
|
|
|
start_time = time.time()
|
|
|
for title in tqdm(title_list, desc="search each title"):
|
|
|
- self.search_title_in_weixin(title, inner_account_set)
|
|
|
+ self.search_title_in_weixin(title, inner_account_gh_id_set)
|
|
|
|
|
|
# 通知
|
|
|
bot(
|
|
@@ -249,6 +251,4 @@ class WeixinAccountCrawler(object):
|
|
|
"新增账号数量": insert_account_count
|
|
|
},
|
|
|
mention=False
|
|
|
- )
|
|
|
-
|
|
|
-
|
|
|
+ )
|