# gzh_author.py

import json
import time
import random

import requests
import scrapy
import urllib3

from gzh_spider import functions
from gzh_spider.items import GzhSpiderItem


class GzhAuthorSpider(scrapy.Spider):
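    """Crawl article metadata from WeChat official accounts (公众号).

    The spider resolves each account's fakeid via the searchbiz endpoint,
    then pages through the appmsg endpoint to collect per-article metadata.
    """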
    name = "gzh_author"
    # Fetch the auth token used for requests to mp.weixin.qq.com
    token_dict = functions.return_token()
    start_urls = []
    user_base_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
    video_base_url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
    begin = 0
    parse_params = functions.params("parse", token_dict)
    parse_headers = functions.header("parse", token_dict)
    parse_video_params = functions.params("parse_video", token_dict)
    parse_video_headers = functions.header("parse_video", token_dict)
    custom_settings = {
        "LOG_FILE": "/Users/luojunhui/cyber/gzh_spider/gzh_spider/logs/20230911.log",
        # "LOG_LEVEL": "INFO"
    }
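
    # Note: parse_params and parse_video_params are class-level dicts that the
    # callbacks below mutate in place, so query state is shared across requests.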

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the list of official accounts to crawl from a local JSON file
        with open("/Users/luojunhui/cyber/gzh_spider/user_list.json", "r", encoding="utf-8") as f:
            data = json.loads(f.read())
        self.start_urls = data
        self.rule_dict = {}
        self.task_dict = {}

    def start_requests(self):
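        """Yield one searchbiz request per account in the configured slice of start_urls."""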
        # Crawl accounts 200-249 of the configured list
        for index, url_obj in enumerate(self.start_urls[200:250]):
            # Wait 20-40 s between official accounts to avoid rate limiting
            time.sleep(random.randint(20, 40))
            print("start {} {}".format(index + 201, url_obj['link']))
            yield scrapy.Request(
                url=self.user_base_url,
                # headers=self.headers,
                dont_filter=True,
                callback=self.parse,
                cb_kwargs={"link": url_obj['link']}
            )

    def parse(self, response, link):
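        """Resolve the account's fakeid via the searchbiz endpoint and queue its article-list request."""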
        request_p = self.parse_params
        request_p['query'] = link
        urllib3.disable_warnings()
        time.sleep(random.randint(5, 20))
        response_obj = requests.get(
            url=self.user_base_url,
            headers=self.parse_headers,
            params=request_p,
            verify=False
        )
        response_obj.close()
        result = response_obj.json()
        print(result)
        # Check whether the response reports an error
        base_resp = result['base_resp']
        if base_resp['err_msg'] == "invalid session":
            print("invalid session, need to change token and cookie")
            return
        if base_resp['err_msg'] == "freq control":
            print("freq control, need to change token")
            return
        if base_resp['err_msg'] == "invalid args" and base_resp['ret'] == 200002:
            print("invalid args")
            return
        if base_resp['err_msg'] == "ok":
            if len(result['list']) == 0:
                print("No more videos")
            else:
                user_info_dict = {
                    "user_name": result['list'][0]['nickname'],
                    "user_id": result['list'][0]['fakeid'],
                    "avatar_url": result['list'][0]['round_head_img']
                }
                yield scrapy.Request(
                    url=self.video_base_url,
                    cb_kwargs=user_info_dict,
                    callback=self.parse_video
                )

    def parse_video(self, response, user_id, user_name, avatar_url):
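        """Page through the appmsg endpoint and yield a GzhSpiderItem per article."""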
        if response.status == 200:
            print("Start fetching official account details")
            while True:
                video_params = self.parse_video_params
                video_params['begin'] = str(self.begin)
                video_params['fakeid'] = user_id
                print(json.dumps(video_params))
                urllib3.disable_warnings()
                r = requests.get(url=self.video_base_url, headers=self.parse_video_headers,
                                 params=video_params, verify=False)
                r.close()
                # Check the response and log it
                video_list_result = r.json()
                print(video_list_result)
                if video_list_result['base_resp']['err_msg'] == "invalid session":
                    print("invalid session")
                    # continue
                    return
                if video_list_result['base_resp']['err_msg'] == "freq control":
                    print("freq control")
                    return
                    # continue
                if video_list_result['base_resp']['err_msg'] == "invalid args" and video_list_result['base_resp']['ret'] == 200002:
                    print("invalid args")
                    # continue
                # if video_list_result.get("app_msg_list", 0) != 0:
                #     print("freq control")
                #     continue
                if len(video_list_result.get('app_msg_list', [])) == 0:
                    print("No more videos")
                    return
                app_msg_list = video_list_result['app_msg_list']
                for article in app_msg_list:
                    create_time = article.get("create_time", 0)
                    update_time = article.get("update_time", 0)
                    publish_time_stamp = int(create_time)
                    update_time_stamp = int(update_time)
                    publish_time_str = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
                    )
                    article_url = article.get("link", "")
                    item = GzhSpiderItem()
                    item['video_id'] = article.get("aid", "")
                    item['video_title'] = article.get("title", "").replace(" ", "").replace('"', "").replace("'", "")
                    item['publish_time_stamp'] = publish_time_stamp
                    item['publish_time_str'] = publish_time_str
                    item['user_name'] = user_name
                    item['play_cnt'] = 0
                    item['comment_cnt'] = 0
                    item['like_cnt'] = 0
                    item['share_cnt'] = 0
                    item['user_id'] = user_id
                    item['avatar_url'] = avatar_url
                    item['cover_url'] = article.get("cover", "")
                    item['article_url'] = article.get("link", "")
                    item['video_url'] = functions.find_video_url(article_url)
                    item['session'] = f"gongzhonghao-author1-{int(time.time())}"
                    print(item)
                    # if functions.time_delta_flag(publish_time_stamp, update_time_stamp, self.rule_dict):
                    #     yield item
                    if functions.judge_time_temp(publish_time_stamp):
                        time.sleep(random.randint(2, 5))
                        yield item
                    else:
                        return
                # Next page: wait 60 s, then advance the offset
                print("Waiting 60s")
                time.sleep(60)
                self.begin += 5
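

if __name__ == "__main__":
    # Minimal in-process runner sketch: assumes a standard Scrapy project layout
    # so get_project_settings() can locate settings.py. Equivalent to running
    # `scrapy crawl gzh_author` from the project root.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(GzhAuthorSpider)
    process.start()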