|
@@ -3,7 +3,7 @@
|
|
|
"""
|
|
|
import time
|
|
|
import traceback
|
|
|
-from typing import List, Set, Dict
|
|
|
+from typing import List, Set, Dict, Tuple
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
@@ -37,6 +37,32 @@ class WeixinAccountCrawler(object):
|
|
|
account_name_set.add(account_name_obj['account_name'])
|
|
|
return account_name_set
|
|
|
|
|
|
def get_crawler_articles(self) -> List[Dict]:
    """
    Fetch already-crawled articles that still have to be checked for a
    linked source account (any linked account is crawled afterwards).

    Selects ``id`` and ``article_url`` from ``publish_single_video_source``
    for rows whose ``source_account`` equals
    ``const.NEED_SCAN_SOURCE_ACCOUNT``.

    :return: the result of ``self.db_client.select_json`` for that query
    """
    query = f"""
        SELECT id, article_url
        FROM publish_single_video_source
        WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT};
    """
    return self.db_client.select_json(query)
|
|
|
+
|
|
|
def update_crawler_article_status(self, article_id_tuple: Tuple[int, ...]) -> int:
    """
    Mark the given articles as no longer needing a source-account scan.

    Sets ``source_account`` to ``const.DO_NOT_NEED_SOURCE_ACCOUNT`` on every
    row of ``publish_single_video_source`` whose ``id`` is contained in
    ``article_id_tuple``.

    :param article_id_tuple: ids of the rows to update; may be empty
    :return: 0 when ``article_id_tuple`` is empty, otherwise whatever
        ``self.db_client.update`` returns (presumably the affected row
        count -- confirm against the db client)
    """
    # An empty sequence would be expanded to ``WHERE id IN ()``, which is a
    # syntax error on MySQL-style backends. There is nothing to update in
    # that case, so skip the round trip entirely.
    if not article_id_tuple:
        return 0

    sql = """
        UPDATE publish_single_video_source
        SET source_account = %s
        WHERE id IN %s;
    """
    return self.db_client.update(sql, (const.DO_NOT_NEED_SOURCE_ACCOUNT, article_id_tuple))
|
|
|
+
|
|
|
def get_seed_titles(self) -> List[str]:
|
|
|
"""
|
|
|
:return:
|
|
@@ -166,7 +192,7 @@ class WeixinAccountCrawler(object):
|
|
|
|
|
|
# 通知
|
|
|
bot(
|
|
|
- title="微信账号抓取完成",
|
|
|
+ title="微信账号抓取V1完成",
|
|
|
detail={
|
|
|
"总更新账号数量": self.crawler_account_count,
|
|
|
"总耗时": time.time() - start_time,
|
|
@@ -174,3 +200,53 @@ class WeixinAccountCrawler(object):
|
|
|
},
|
|
|
mention=False
|
|
|
)
|
|
|
+
|
|
|
def run_v2(self) -> None:
    """
    Entry point (V2): discover accounts through already-crawled articles.

    For every article returned by ``self.get_crawler_articles()``:

    * articles for which ``self.is_original`` is truthy are skipped;
    * the source account is looked up via ``function.get_source_account``;
      lookup failures and empty results skip the article;
    * otherwise the account is stored with ``self.insert_account`` and the
      article id is remembered.

    The remembered ids are then passed to
    ``self.update_crawler_article_status`` and a summary notification is
    sent through ``bot``.

    NOTE(review): skipped articles (original / no source account / lookup
    failure) are never recorded, so they are selected again on the next
    run -- confirm this is intended.

    :return: None
    """
    # get article list
    crawler_article_list = self.get_crawler_articles()
    article_id_list = []
    insert_account_count = 0
    for crawler_article_obj in tqdm(crawler_article_list, desc="crawler article list"):
        try:
            article_id = crawler_article_obj['id']
            article_url = crawler_article_obj['article_url']
            # original articles have no source account worth following
            if self.is_original(article_url):
                continue

            try:
                source_account_info = function.get_source_account(article_url)
            except Exception:
                # best effort: report the failure instead of hiding it,
                # then move on to the next article
                print(f"get_source_account failed, article_id: {article_id}")
                print(traceback.format_exc())
                continue

            if not source_account_info:
                continue

            account_name = source_account_info['name']
            gh_id = source_account_info['gh_id']
            affected_rows = self.insert_account(gh_id=gh_id, account_name=account_name)
            insert_account_count += affected_rows

            # remember the ids that were processed
            article_id_list.append(int(article_id))
        except Exception as e:
            print(e)
            print(traceback.format_exc())

    # Only touch the database when something was processed; an empty id
    # collection would otherwise produce an invalid ``IN ()`` clause.
    if article_id_list:
        self.update_crawler_article_status(tuple(article_id_list))

    bot(
        title="微信账号抓取V2完成",
        detail={
            "扫描文章数量": len(crawler_article_list),
            "新增账号数量": insert_account_count
        },
        mention=False
    )
|
|
|
+
|
|
|
+
|