import json
import random
import time

import requests
import scrapy
import urllib3

from gzh_spider import functions
from gzh_spider.items import GzhSpiderItem


class GzhAuthorSpider(scrapy.Spider):
    """Crawl WeChat Official Account (gzh) authors and their published articles.

    Flow: for each account link loaded from user_list.json, resolve the
    account to its `fakeid` via the WeChat MP "searchbiz" endpoint (parse),
    then page through that account's article list via the "appmsg" endpoint
    (parse_video), yielding one GzhSpiderItem per article.
    """

    name = "gzh_author"
    # Login token/cookie bundle shared by every request in this run.
    token_dict = functions.return_token()
    start_urls = []
    user_base_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
    video_base_url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
    # Article-list pagination offset; reset per account in parse_video.
    begin = 0
    parse_params = functions.params("parse", token_dict)
    parse_headers = functions.header("parse", token_dict)
    parse_video_params = functions.params("parse_video", token_dict)
    parse_video_headers = functions.header("parse_video", token_dict)
    custom_settings = {
        "LOG_FILE": "/Users/luojunhui/cyber/gzh_spider/gzh_spider/logs/20230911.log",
        # "LOG_LEVEL": "INFO"
    }

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original called super(GzhAuthorSpider).__init__(...)
        # — an unbound super() missing `self` — so scrapy.Spider.__init__
        # never actually ran.
        super(GzhAuthorSpider, self).__init__(*args, **kwargs)
        # Load the list of account objects ({"link": ...}, ...) to crawl.
        with open("/Users/luojunhui/cyber/gzh_spider/user_list.json", "r", encoding="utf-8") as f:
            data = json.loads(f.read())
        self.start_urls = data
        self.rule_dict = {}
        self.task_dict = {}

    def start_requests(self):
        """Emit one searchbiz request per account in the configured slice."""
        # Only accounts 200-249 of the full list are crawled in this run.
        for index, url_obj in enumerate(self.start_urls[200:250]):
            # Wait 20-40s between accounts to stay under frequency control.
            time.sleep(random.randint(20, 40))
            # BUG FIX: the progress log used index + 101 although the slice
            # starts at 200; report the real position in the full list.
            print("start {} {}".format(index + 201, url_obj['link']))
            yield scrapy.Request(
                url=self.user_base_url,
                # headers=self.headers,
                dont_filter=True,
                callback=self.parse,
                cb_kwargs={"link": url_obj['link']}
            )

    def parse(self, response, link):
        """Resolve an account link to its fakeid via the searchbiz endpoint.

        On success, schedules parse_video with the account's name, fakeid
        and avatar URL; on any known API error, logs and stops this account.
        """
        # BUG FIX: copy the shared class-level dict before mutating it so
        # successive calls do not clobber each other's "query" value.
        request_p = dict(self.parse_params)
        request_p['query'] = link
        urllib3.disable_warnings()
        time.sleep(random.randint(5, 20))
        response_obj = requests.get(
            url=self.user_base_url,
            headers=self.parse_headers,
            params=request_p,
            verify=False
        )
        response_obj.close()
        result = response_obj.json()
        print(result)
        # Check base_resp for the known failure modes before using the data.
        base_resp = result['base_resp']
        if base_resp['err_msg'] == "invalid session":
            print("invalid session, need to change token and cookie")
            return
        if base_resp['err_msg'] == "freq control":
            print("freq control, need to change token")
            return
        if base_resp['err_msg'] == "invalid args" and base_resp['ret'] == 200002:
            print("invalid args")
            return
        if base_resp['err_msg'] == "ok":
            if len(result['list']) == 0:
                print("No more videos")
            else:
                # The first search hit is taken to be the wanted account.
                user_info_dict = {
                    "user_name": result['list'][0]['nickname'],
                    "user_id": result['list'][0]['fakeid'],
                    "avatar_url": result['list'][0]['round_head_img']
                }
                yield scrapy.Request(
                    url=self.video_base_url,
                    cb_kwargs=user_info_dict,
                    callback=self.parse_video
                )

    def parse_video(self, response, user_id, user_name, avatar_url):
        """Page through one account's article list, yielding an item per article.

        Stops on API errors, on an empty page, or at the first article that
        falls outside the time window checked by functions.judge_time_temp
        (the list is newest-first, so everything after it is older too).
        """
        if response.status != 200:
            return
        print('开始获取公众号信息详情')
        # BUG FIX: reset the pagination offset for every account; the
        # original kept incrementing self.begin across accounts, so every
        # account after the first started paging mid-list.
        self.begin = 0
        while True:
            # BUG FIX: copy the shared class-level dict before mutating it.
            video_params = dict(self.parse_video_params)
            video_params['begin'] = str(self.begin)
            video_params['fakeid'] = user_id
            print(json.dumps(video_params))
            urllib3.disable_warnings()
            r = requests.get(
                url=self.video_base_url,
                headers=self.parse_video_headers,
                params=video_params,
                verify=False
            )
            r.close()
            # Inspect the result and log it before trusting the payload.
            video_list_result = r.json()
            print(video_list_result)
            base_resp = video_list_result['base_resp']
            if base_resp['err_msg'] == "invalid session":
                print("invalid session")
                return
            if base_resp['err_msg'] == "freq control":
                print("freq control")
                return
            if base_resp['err_msg'] == "invalid args" and base_resp['ret'] == 200002:
                print("invalid args")
                # BUG FIX: the original fell through here and then crashed
                # with KeyError on the missing 'app_msg_list'; stop instead.
                return
            app_msg_list = video_list_result['app_msg_list']
            if len(app_msg_list) == 0:
                print("No more videos")
                return
            for article in app_msg_list:
                publish_time_stamp = int(article.get("create_time", 0))
                publish_time_str = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
                )
                article_url = article.get("link", "")
                item = GzhSpiderItem()
                item['video_id'] = article.get("aid", "")
                item['video_title'] = article.get("title", "").replace(" ", "").replace('"', "").replace("'", "")
                item['publish_time_stamp'] = publish_time_stamp
                item['publish_time_str'] = publish_time_str
                item['user_name'] = user_name
                item['play_cnt'] = 0
                item['comment_cnt'] = 0
                item['like_cnt'] = 0
                item['share_cnt'] = 0
                item['user_id'] = user_id
                item['avatar_url'] = avatar_url
                item['cover_url'] = article.get("cover", "")
                item['article_url'] = article.get("link", "")
                item['video_url'] = functions.find_video_url(article_url)
                item['session'] = f"gongzhonghao-author1-{int(time.time())}"
                print(item)
                # Keep only articles inside the configured time window; the
                # first out-of-window article ends this account entirely.
                if functions.judge_time_temp(publish_time_stamp):
                    time.sleep(random.randint(2, 5))
                    yield item
                else:
                    return
            # Next page: wait 60s between pages to avoid frequency control.
            print("等待60s")
            time.sleep(60)
            self.begin += 5