|
@@ -0,0 +1,176 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+from typing import List, Set, Dict
|
|
|
+
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications import WeixinSpider, longArticlesMySQL, log, bot
|
|
|
+from applications.const import WeixinVideoCrawlerConst
|
|
|
+from applications.functions import Functions
|
|
|
+
|
|
|
# Module-level singletons shared by WeixinAccountCrawler below.
const = WeixinVideoCrawlerConst()  # crawl thresholds / tuning constants (read thresholds, sleep, page limits)
function = Functions()  # misc helpers; used here to extract a video URL from an article page
|
|
|
+
|
|
|
+
|
|
|
class WeixinAccountCrawler(object):
    """
    Account crawler.

    Discovers new WeChat accounts by searching WeChat for the titles of
    high-performing internal articles, then stores any external account whose
    matching article is non-original and carries a video link.
    """

    def __init__(self):
        self.db_client = longArticlesMySQL()
        self.spider = WeixinSpider()
        # Number of accounts inserted during this run; reported via bot() at the end.
        self.account_count = 0

    def get_inner_account_name(self) -> Set[str]:
        """
        Fetch the names of accounts we already operate internally.

        Used to skip our own accounts when processing search results.

        :return: set of distinct account names from datastat_sort_strategy
        """
        sql = "select distinct account_name from datastat_sort_strategy;"
        account_name_list = self.db_client.select_json(sql)
        return {row['account_name'] for row in account_name_list}

    def get_seed_titles(self) -> List[str]:
        """
        Fetch seed titles: recent articles whose read rate and view count
        exceed the configured thresholds, best-performing first.

        :return: list of distinct titles ordered by read_rate descending
        """
        publish_timestamp_threshold = int(time.time()) - const.STAT_PERIOD
        # NOTE(fix): the original query ended the WHERE clause with ';' and then
        # appended 'ORDER BY ...;', which is invalid SQL. The single terminating
        # semicolon now follows the ORDER BY so the ordering actually applies.
        sql = f"""
            SELECT distinct title
            FROM datastat_sort_strategy
            WHERE read_rate > {const.READ_AVG_MULTIPLE}
              AND view_count > {const.MIN_READ_COUNT}
              AND publish_timestamp > {publish_timestamp_threshold}
            ORDER BY read_rate DESC;
        """
        title_list = self.db_client.select_json(sql)
        return [i['title'] for i in title_list]

    def is_original(self, article_url: str) -> bool:
        """
        Check whether the article at the given URL is marked as original content.

        :param article_url: WeChat article URL
        :return: truthy when the spider reports the article as original
        """
        response = self.spider.get_article_text(article_url)
        data = response['data']['data']
        return data['is_original']

    def insert_account(self, gh_id: str, account_name: str) -> int:
        """
        Insert a discovered account.

        Uses INSERT IGNORE, so an account that already exists (duplicate key)
        is silently skipped and contributes 0 to the returned row count.

        :param gh_id: WeChat gh_id of the account
        :param account_name: display name of the account
        :return: number of rows actually inserted (0 or 1)
        """
        init_date = time.strftime("%Y-%m-%d", time.localtime())
        sql = """
            INSERT IGNORE INTO weixin_account_for_videos
            (gh_id, account_name, account_init_date)
            VALUES
            (%s, %s, %s);
            """
        insert_rows = self.db_client.update(sql, (gh_id, account_name, init_date))
        return insert_rows

    def process_search_result(self, response: Dict, inner_account_set: Set[str]):
        """
        Process one page of WeChat search results.

        For each article: resolve its account, skip internal accounts, skip
        original articles, skip articles without an extractable video URL,
        and insert the remaining accounts. Per-article failures are logged
        and do not abort the page.

        :param response: raw search response from the spider
        :param inner_account_set: internal account names to skip
        :return: None
        """
        if response['code'] != 0:
            return

        article_list = response['data']['data']
        if not article_list:
            return
        for article in article_list:
            try:
                # First resolve the publishing account and skip our own accounts.
                article_url = article['url']
                account_detail = self.spider.get_account_by_url(article_url)
                account_detail = account_detail['data']['data']
                account_name = account_detail['account_name']
                gh_id = account_detail['wx_gh']
                if account_name in inner_account_set:
                    continue

                # Skip original articles — only non-original (reposted) hits qualify.
                if self.is_original(article_url):
                    continue

                # Require an extractable video link; extraction failure means skip.
                try:
                    video_url = function.get_video_url(article_url)
                except Exception:
                    continue
                if not video_url:
                    continue

                # Store the newly discovered account.
                insert_rows = self.insert_account(gh_id=gh_id, account_name=account_name)
                if insert_rows:
                    log(
                        task="account_crawler_v1",
                        function="process_search_result",
                        message="insert account success",
                        data={
                            "gh_id": gh_id,
                            "account_name": account_name
                        }
                    )
                    self.account_count += 1
            except Exception as e:
                log(
                    task="account_crawler_v1",
                    function="process_search_result",
                    message="insert account error",
                    data={
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                        "data": article
                    }
                )

    def search_title_in_weixin(self, title: str, inner_account_set: Set[str]) -> None:
        """
        Search WeChat for a title and process every result page.

        Sleeps between pages to respect the search API's rate limits.

        :param title: seed title to search for
        :param inner_account_set: internal account names to skip
        :return: None
        """
        for page_index in tqdm(range(1, const.MAX_SEARCH_PAGE_NUM + 1), desc='searching: {}'.format(title)):
            # The search API expects the page number as a string.
            response = self.spider.search_articles(title, page=str(page_index))
            self.process_search_result(response, inner_account_set)
            time.sleep(const.SLEEP_SECONDS)

    def run(self) -> None:
        """
        Entry point: search every seed title and report totals via bot().

        :return: None
        """
        # get seed titles
        title_list = self.get_seed_titles()
        # get inner accounts set
        inner_account_set = self.get_inner_account_name()

        start_time = time.time()
        for title in tqdm(title_list, desc="search each title"):
            self.search_title_in_weixin(title, inner_account_set)

        # Completion notification.
        bot(
            title="微信账号抓取完成",
            detail={
                "总更新账号数量": self.account_count,
                "总耗时": time.time() - start_time,
                "种子标题数量": len(title_list)
            },
            mention=False
        )
|