weixin_account_crawler.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import traceback
  6. from typing import List, Set, Dict
  7. from tqdm import tqdm
  8. from applications import WeixinSpider, longArticlesMySQL, log, bot
  9. from applications.const import WeixinVideoCrawlerConst
  10. from applications.functions import Functions
  11. const = WeixinVideoCrawlerConst()
  12. function = Functions()


class WeixinAccountCrawler(object):
    """
    Account crawler: discovers candidate WeChat accounts from article search results
    """

    def __init__(self):
        self.db_client = longArticlesMySQL()
        self.spider = WeixinSpider()
        self.crawler_account_count = 0

    def get_inner_account_name(self) -> Set[str]:
        """
        Get the names of internal (already-managed) accounts
        :return: set of internal account names
        """
        sql = "select distinct account_name from datastat_sort_strategy;"
        account_name_list = self.db_client.select_json(sql)
        account_name_set = set()
        for account_name_obj in account_name_list:
            account_name_set.add(account_name_obj['account_name'])
        return account_name_set

    def get_seed_titles(self) -> List[str]:
        """
        Get seed titles: high-performing articles published within the stat period
        :return: distinct titles, ordered by read_rate descending
        """
        publish_timestamp_threshold = int(time.time()) - const.STAT_PERIOD
        sql = f"""
            SELECT DISTINCT title
            FROM datastat_sort_strategy
            WHERE read_rate > {const.READ_AVG_MULTIPLE}
              AND view_count > {const.MIN_READ_COUNT}
              AND publish_timestamp > {publish_timestamp_threshold}
            ORDER BY read_rate DESC;
        """
        title_list = self.db_client.select_json(sql)
        title_list = [i['title'] for i in title_list]
        return title_list

    def is_original(self, article_url: str) -> bool:
        """
        Determine whether an article is marked as original
        :param article_url: URL of the article to check
        :return: True if the article is original
        """
        response = self.spider.get_article_text(article_url)
        data = response['data']['data']
        return data['is_original']

    def insert_account(self, gh_id: str, account_name: str) -> int:
        """
        Insert an account record
        :param account_name: display name of the account
        :param gh_id: gh id of the account
        :return: number of rows inserted (0 if the account already exists)
        """
        init_date = time.strftime("%Y-%m-%d", time.localtime())
        sql = """
            INSERT IGNORE INTO weixin_account_for_videos
            (gh_id, account_name, account_init_date)
            VALUES
            (%s, %s, %s);
        """
        insert_rows = self.db_client.update(sql, (gh_id, account_name, init_date))
        return insert_rows

    def process_search_result(self, response: Dict, inner_account_set: Set[str]):
        """
        Process one page of search results
        :param response: response returned by the article search API
        :param inner_account_set: names of internal accounts to skip
        :return:
        """
        if response['code'] != 0:
            return
        article_list = response['data']['data']
        if article_list:
            for article in article_list:
                try:
                    # First check whether the account is an internal one
                    article_url = article['url']
                    account_detail = self.spider.get_account_by_url(article_url)
                    account_detail = account_detail['data']['data']
                    account_name = account_detail['account_name']
                    gh_id = account_detail['wx_gh']
                    if account_name in inner_account_set:
                        continue
                    # Skip search results that are original articles
                    if self.is_original(article_url):
                        continue
                    # Check whether the article carries a video link
                    try:
                        video_url = function.get_video_url(article_url)
                    except Exception:
                        continue
                    if not video_url:
                        continue
                    # Crawl the account into the database
                    insert_rows = self.insert_account(gh_id=gh_id, account_name=account_name)
                    if insert_rows:
                        log(
                            task="account_crawler_v1",
                            function="process_search_result",
                            message="insert account success",
                            data={
                                "gh_id": gh_id,
                                "account_name": account_name
                            }
                        )
                        self.crawler_account_count += 1
                except Exception as e:
                    log(
                        task="account_crawler_v1",
                        function="process_search_result",
                        message="insert account error",
                        data={
                            "error": str(e),
                            "traceback": traceback.format_exc(),
                            "data": article
                        }
                    )

    def search_title_in_weixin(self, title: str, inner_account_set: Set[str]) -> None:
        """
        Search a title on WeChat via the search API
        :param inner_account_set: names of internal accounts to skip
        :param title: seed title to search for
        :return:
        """
        for page_index in tqdm(range(1, const.MAX_SEARCH_PAGE_NUM + 1), desc='searching: {}'.format(title)):
            response = self.spider.search_articles(title, page=str(page_index))
            self.process_search_result(response, inner_account_set)
            time.sleep(const.SLEEP_SECONDS)

    def run(self) -> None:
        """
        Entry point
        :return:
        """
        # get seed titles
        title_list = self.get_seed_titles()
        # get inner accounts set
        inner_account_set = self.get_inner_account_name()
        start_time = time.time()
        for title in tqdm(title_list, desc="search each title"):
            self.search_title_in_weixin(title, inner_account_set)
        # send a completion notification via the bot
        bot(
            title="微信账号抓取完成",
            detail={
                "总更新账号数量": self.crawler_account_count,
                "总耗时": time.time() - start_time,
                "种子标题数量": len(title_list)
            },
            mention=False
        )
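

# A minimal usage sketch, assuming run() is the intended entry point for a
# scheduler or cron job. The original file carries no __main__ guard, so this
# block is an illustrative assumption rather than part of the source.
if __name__ == '__main__':
    crawler = WeixinAccountCrawler()
    crawler.run()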