import json
import time
import random

import requests
import scrapy
import urllib3

from gzh_spider import functions
from gzh_spider.items import GzhSpiderItem
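

# Scrapy spider that crawls article metadata from WeChat Official Accounts
# ("gongzhonghao") via the mp.weixin.qq.com admin endpoints. A logged-in
# token/cookie pair is supplied by gzh_spider.functions and baked into the
# request params and headers below.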
class GzhAuthorSpider(scrapy.Spider):
    name = "gzh_author"
    # fetch the logged-in token and cookie
    token_dict = functions.return_token()
    start_urls = []
    user_base_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
    video_base_url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
    parse_params = functions.params("parse", token_dict)
    parse_headers = functions.header("parse", token_dict)
    parse_video_params = functions.params("parse_video", token_dict)
    parse_video_headers = functions.header("parse_video", token_dict)
    custom_settings = {
        "LOG_FILE": "/Users/luojunhui/cyber/gzh_spider/gzh_spider/logs/20230911.log",
        # "LOG_LEVEL": "INFO"
    }
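
    # NOTE: token_dict and the derived params/headers are class attributes, so
    # they are evaluated once at import time; a stale token means restarting
    # the spider.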

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        with open("/Users/luojunhui/cyber/gzh_spider/user_list.json", "r", encoding="utf-8") as f:
            self.start_urls = json.load(f)
        self.rule_dict = {}
        self.task_dict = {}
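
    # Crawl flow: searchbiz resolves an account link to its fakeid, then appmsg
    # pages through that account's published articles (the begin offset advances
    # by five per page).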

    def start_requests(self):
        # wait 20-40s between accounts to stay under WeChat's frequency control
        for index, url_obj in enumerate(self.start_urls[200:250]):
            time.sleep(random.randint(20, 40))
            # index + 201 is this account's absolute position in user_list.json
            self.logger.info("start %s %s", index + 201, url_obj['link'])
            yield scrapy.Request(
                url=self.user_base_url,
                dont_filter=True,
                callback=self.parse,
                cb_kwargs={"link": url_obj['link']}
            )
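
    # The scrapy.Request above only triggers this callback; the actual searchbiz
    # lookup below is issued with requests, with the token params and headers
    # passed explicitly.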

    def parse(self, response, link):
        # copy the shared search params so the query does not leak across requests
        request_p = dict(self.parse_params)
        request_p['query'] = link
        urllib3.disable_warnings()
        time.sleep(random.randint(5, 20))
        response_obj = requests.get(
            url=self.user_base_url,
            headers=self.parse_headers,
            params=request_p,
            verify=False
        )
        result = response_obj.json()
        response_obj.close()
        self.logger.info(result)
        # bail out on the known error states before touching the payload
        base_resp = result['base_resp']
        if base_resp['err_msg'] == "invalid session":
            self.logger.warning("invalid session, need to change token and cookie")
            return
        if base_resp['err_msg'] == "freq control":
            self.logger.warning("freq control, need to change token")
            return
        if base_resp['err_msg'] == "invalid args" and base_resp['ret'] == 200002:
            self.logger.warning("invalid args")
            return
        if base_resp['err_msg'] == "ok":
            if len(result['list']) == 0:
                self.logger.info("no account matched this link")
            else:
                user_info_dict = {
                    "user_name": result['list'][0]['nickname'],
                    "user_id": result['list'][0]['fakeid'],
                    "avatar_url": result['list'][0]['round_head_img']
                }
                # dont_filter is required here: every account reuses the same appmsg
                # URL, so Scrapy's dupe filter would drop all requests after the first
                yield scrapy.Request(
                    url=self.video_base_url,
                    dont_filter=True,
                    cb_kwargs=user_info_dict,
                    callback=self.parse_video
                )
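
    # parse_video pages through one account's article list until the list is
    # exhausted or functions.judge_time_temp rejects an article (presumably a
    # publish-time cutoff).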

    def parse_video(self, response, user_id, user_name, avatar_url):
        if response.status == 200:
            self.logger.info("start fetching the account's article list")
            # keep the page offset local so pagination restarts for every account
            begin = 0
            while True:
                video_params = dict(self.parse_video_params)
                video_params['begin'] = str(begin)
                video_params['fakeid'] = user_id
                self.logger.info(json.dumps(video_params))
                urllib3.disable_warnings()
                r = requests.get(
                    url=self.video_base_url,
                    headers=self.parse_video_headers,
                    params=video_params,
                    verify=False
                )
                # parse and log the result before error triage
                video_list_result = r.json()
                r.close()
                self.logger.info(video_list_result)
                base_resp = video_list_result['base_resp']
                if base_resp['err_msg'] == "invalid session":
                    self.logger.warning("invalid session")
                    return
                if base_resp['err_msg'] == "freq control":
                    self.logger.warning("freq control")
                    return
                if base_resp['err_msg'] == "invalid args" and base_resp['ret'] == 200002:
                    # must return here, otherwise the missing app_msg_list key raises below
                    self.logger.warning("invalid args")
                    return
                if len(video_list_result['app_msg_list']) == 0:
                    self.logger.info("No more videos")
                    return
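
                # each page returns up to five articles; build one item per article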
                app_msg_list = video_list_result['app_msg_list']
                for article in app_msg_list:
                    create_time = article.get("create_time", 0)
                    update_time = article.get("update_time", 0)
                    publish_time_stamp = int(create_time)
                    update_time_stamp = int(update_time)
                    publish_time_str = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
                    )
                    article_url = article.get("link", "")
                    item = GzhSpiderItem()
                    item['video_id'] = article.get("aid", "")
                    item['video_title'] = (
                        article.get("title", "").replace(" ", "").replace('"', "").replace("'", "")
                    )
                    item['publish_time_stamp'] = publish_time_stamp
                    item['publish_time_str'] = publish_time_str
                    item['user_name'] = user_name
                    item['play_cnt'] = 0
                    item['comment_cnt'] = 0
                    item['like_cnt'] = 0
                    item['share_cnt'] = 0
                    item['user_id'] = user_id
                    item['avatar_url'] = avatar_url
                    item['cover_url'] = article.get("cover", "")
                    item['article_url'] = article_url
                    item['video_url'] = functions.find_video_url(article_url)
                    item['session'] = f"gongzhonghao-author1-{int(time.time())}"
                    self.logger.info(item)
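                    # the article list is assumed newest-first, so once one article
                    # falls outside the time window the rest of the account is skipped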
                    # if functions.time_delta_flag(publish_time_stamp, update_time_stamp, self.rule_dict):
                    #     yield item
                    if functions.judge_time_temp(publish_time_stamp):
                        time.sleep(random.randint(2, 5))
                        yield item
                    else:
                        return
                # turn the page: wait 60s, then advance the offset by one page
                self.logger.info("waiting 60s before the next page")
                time.sleep(60)
                begin += 5