weixinCategoryCrawler.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. """
  2. @author: luojunhui
  3. 抓取全局品类文章
  4. """
  5. import time
  6. from tqdm import tqdm
  7. from applications import WeixinSpider, Functions, DeNetMysql
  8. class weixinCategory(object):
  9. """
  10. 微信全局品类账号抓取
  11. """
  12. def __init__(self):
  13. self.spider_client = DeNetMysql()
  14. self.spider = WeixinSpider()
  15. self.function = Functions()
  16. def getAccountList(self):
  17. """
  18. 获取账号
  19. :return:
  20. """
  21. now_time = int(time.time())
  22. twenty_hours_ago = now_time - 3600 * 20
  23. sql = f"""select * from long_article_accounts_outside where latest_article_timestamp < {twenty_hours_ago};"""
  24. account_tuple = self.spider_client.select(sql)
  25. result = [
  26. {
  27. "gh_id": i[0],
  28. "platform": i[1],
  29. "account_name": i[2],
  30. "category": i[3],
  31. "latest_timestamp": i[4],
  32. }
  33. for i in account_tuple
  34. ]
  35. return result
  36. def updateDataIntoMysql(self, gh_id, category, article_list):
  37. """
  38. 将数据更新到数据库
  39. :return:
  40. """
  41. for article_obj in article_list:
  42. detail_article_list = article_obj["AppMsg"]["DetailInfo"]
  43. for obj in detail_article_list:
  44. try:
  45. show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
  46. show_view_count = show_stat.get("show_view_count", 0)
  47. show_like_count = show_stat.get("show_like_count", 0)
  48. insert_sql = f"""
  49. insert into crawler_meta_article
  50. (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
  51. VALUES
  52. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  53. """
  54. self.spider_client.update(
  55. sql=insert_sql,
  56. params=(
  57. "weixin",
  58. "account",
  59. category,
  60. gh_id,
  61. obj['ItemIndex'],
  62. obj["Title"],
  63. obj["ContentUrl"],
  64. show_view_count,
  65. show_like_count,
  66. obj["Digest"],
  67. obj["send_time"],
  68. int(time.time()),
  69. 1,
  70. self.function.generateGzhId(obj["ContentUrl"]),
  71. ),
  72. )
  73. except Exception as e:
  74. print(e)
  75. def updateLatestAccountTimeStamp(self, gh_id):
  76. """
  77. 更新账号的最新时间戳
  78. :return:
  79. """
  80. select_sql = f"""
  81. SELECT publish_time
  82. From crawler_meta_article
  83. WHERE out_account_id = '{gh_id}'
  84. ORDER BY publish_time DESC LIMIT 1;
  85. """
  86. result = self.spider_client.select(select_sql)
  87. time_stamp = result[0][0]
  88. update_sql = f"""
  89. update long_article_accounts_outside
  90. set latest_article_timestamp = %s
  91. where account_id = %s;
  92. """
  93. self.spider_client.update(sql=update_sql, params=(time_stamp, gh_id))
  94. def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None):
  95. """
  96. 更新账号文章
  97. :return:
  98. """
  99. response = self.spider.update_msg_list(ghId=gh_id, index=index)
  100. msg_list = response.get("data", {}).get("data")
  101. if msg_list:
  102. last_article_in_this_msg = msg_list[-1]
  103. self.updateDataIntoMysql(
  104. gh_id=gh_id, category=category, article_list=msg_list
  105. )
  106. last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
  107. if latest_time_stamp < last_time_stamp_in_this_msg:
  108. next_cursor = response["data"]["next_cursor"]
  109. return self.updateEachAccountArticles(
  110. gh_id=gh_id,
  111. latest_time_stamp=latest_time_stamp,
  112. category=category,
  113. index=next_cursor,
  114. )
  115. else:
  116. # 更新最近抓取时间
  117. self.updateLatestAccountTimeStamp(gh_id=gh_id)
  118. else:
  119. print("No more data")
  120. if __name__ == "__main__":
  121. wxCategory = weixinCategory()
  122. account_list = wxCategory.getAccountList()
  123. for account in tqdm(account_list):
  124. try:
  125. wxCategory.updateEachAccountArticles(
  126. gh_id=account["gh_id"],
  127. category=account["category"],
  128. latest_time_stamp=account["latest_timestamp"],
  129. )
  130. except Exception as e:
  131. print("fail because of {}".format(e))