|
@@ -605,101 +605,104 @@ class XiguarecommendScheduling:
|
|
|
queryCount = 1
|
|
|
while True:
|
|
|
Common.logger(log_type, crawler).info(f"正在抓取第{queryCount}页视频")
|
|
|
- signature = cls.get_signature(env)
|
|
|
- if signature is None:
|
|
|
- Common.logger(log_type, crawler).warning(f"signature:{signature}")
|
|
|
- time.sleep(1)
|
|
|
- continue
|
|
|
- url = "https://www.ixigua.com/api/feedv2/feedById?"
|
|
|
- params = {
|
|
|
- "channelId": "94349543909",
|
|
|
- "count": "9",
|
|
|
- "maxTime": str(int(time.time())),
|
|
|
- # "maxTime": "1683190690",
|
|
|
- "queryCount": str(queryCount),
|
|
|
- "_signature": signature,
|
|
|
- "request_from": "701",
|
|
|
- "offset": "0",
|
|
|
- "referrer:": "https://open.weixin.qq.com/",
|
|
|
- "aid": "1768",
|
|
|
- "msToken": "XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua",
|
|
|
- # "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
|
|
|
- }
|
|
|
- headers = {
|
|
|
- 'referer': 'https://www.ixigua.com/',
|
|
|
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
|
|
|
- 'authority': 'www.ixigua.com',
|
|
|
- 'accept': 'application/json, text/plain, */*',
|
|
|
- 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
|
|
- 'cache-control': 'no-cache',
|
|
|
- # 'cookie': 'ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; support_webp=true; support_avif=true; csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; msToken=XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua; ixigua-a-s=1; tt_scid=UTduWO4ij7cX6YKx23sDuV4zjvFkGFtFk5ZBhEnd1lJ1EZBykStzU7tbWQOSzGdE0fc6; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683198318%7Cffc2eef612caab19a0db93b4cec27e21a6230f9b82ab4bf5b1c6193d082baab1',
|
|
|
- 'pragma': 'no-cache',
|
|
|
- 'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
|
|
|
- 'sec-ch-ua-mobile': '?0',
|
|
|
- 'sec-ch-ua-platform': '"macOS"',
|
|
|
- 'sec-fetch-dest': 'empty',
|
|
|
- 'sec-fetch-mode': 'cors',
|
|
|
- 'sec-fetch-site': 'same-origin',
|
|
|
- # 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
|
|
|
- # 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
|
|
|
- }
|
|
|
- urllib3.disable_warnings()
|
|
|
- s = requests.session()
|
|
|
- # max_retries=3 重试3次
|
|
|
- s.mount('http://', HTTPAdapter(max_retries=3))
|
|
|
- s.mount('https://', HTTPAdapter(max_retries=3))
|
|
|
- response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
|
|
|
- response.close()
|
|
|
- queryCount += 1
|
|
|
- if response.status_code != 200:
|
|
|
- Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
|
|
|
- return
|
|
|
- elif 'data' not in response.text:
|
|
|
- Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
|
|
|
- return
|
|
|
- elif 'channelFeed' not in response.json()['data']:
|
|
|
- Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
|
|
|
- return
|
|
|
- elif 'Data' not in response.json()['data']['channelFeed']:
|
|
|
- Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
|
|
|
- return
|
|
|
- elif len(response.json()['data']['channelFeed']['Data']) == 0:
|
|
|
- Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
|
|
|
- return
|
|
|
- else:
|
|
|
- feeds = response.json()['data']['channelFeed']['Data']
|
|
|
- for i in range(len(feeds)):
|
|
|
- try:
|
|
|
- item_id = feeds[i].get("data", {}).get("item_id", "")
|
|
|
- if item_id == "":
|
|
|
- Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
- continue
|
|
|
- video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
- if video_dict is None:
|
|
|
- Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
- continue
|
|
|
- for k, v in video_dict.items():
|
|
|
- Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
|
|
|
- Common.logger(log_type, crawler).info("不满足抓取规则\n")
|
|
|
- elif any(str(word) if str(word) in video_dict["video_title"] else False
|
|
|
- for word in get_config_from_mysql(log_type=log_type,
|
|
|
- source=crawler,
|
|
|
- env=env,
|
|
|
- text="filter",
|
|
|
- action="")) is True:
|
|
|
- Common.logger(log_type, crawler).info('已中过滤词\n')
|
|
|
- elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
|
|
|
- Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
- else:
|
|
|
- cls.download_publish(log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- our_uid=our_uid,
|
|
|
- video_dict=video_dict,
|
|
|
- rule_dict=rule_dict,
|
|
|
- env=env)
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
|
|
|
+ try:
|
|
|
+ signature = cls.get_signature(env)
|
|
|
+ if signature is None:
|
|
|
+ Common.logger(log_type, crawler).warning(f"signature:{signature}")
|
|
|
+ time.sleep(1)
|
|
|
+ continue
|
|
|
+ url = "https://www.ixigua.com/api/feedv2/feedById?"
|
|
|
+ params = {
|
|
|
+ "channelId": "94349543909",
|
|
|
+ "count": "9",
|
|
|
+ "maxTime": str(int(time.time())),
|
|
|
+ # "maxTime": "1683190690",
|
|
|
+ "queryCount": str(queryCount),
|
|
|
+ "_signature": signature,
|
|
|
+ "request_from": "701",
|
|
|
+ "offset": "0",
|
|
|
+ "referrer:": "https://open.weixin.qq.com/",
|
|
|
+ "aid": "1768",
|
|
|
+ "msToken": "XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua",
|
|
|
+ # "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
|
|
|
+ }
|
|
|
+ headers = {
|
|
|
+ 'referer': 'https://www.ixigua.com/',
|
|
|
+ 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
|
|
|
+ 'authority': 'www.ixigua.com',
|
|
|
+ 'accept': 'application/json, text/plain, */*',
|
|
|
+ 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
|
|
+ 'cache-control': 'no-cache',
|
|
|
+ # 'cookie': 'ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; support_webp=true; support_avif=true; csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; msToken=XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua; ixigua-a-s=1; tt_scid=UTduWO4ij7cX6YKx23sDuV4zjvFkGFtFk5ZBhEnd1lJ1EZBykStzU7tbWQOSzGdE0fc6; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683198318%7Cffc2eef612caab19a0db93b4cec27e21a6230f9b82ab4bf5b1c6193d082baab1',
|
|
|
+ 'pragma': 'no-cache',
|
|
|
+ 'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
|
|
|
+ 'sec-ch-ua-mobile': '?0',
|
|
|
+ 'sec-ch-ua-platform': '"macOS"',
|
|
|
+ 'sec-fetch-dest': 'empty',
|
|
|
+ 'sec-fetch-mode': 'cors',
|
|
|
+ 'sec-fetch-site': 'same-origin',
|
|
|
+ # 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
|
|
|
+ # 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
|
|
|
+ }
|
|
|
+ urllib3.disable_warnings()
|
|
|
+ s = requests.session()
|
|
|
+ # max_retries=3 重试3次
|
|
|
+ s.mount('http://', HTTPAdapter(max_retries=3))
|
|
|
+ s.mount('https://', HTTPAdapter(max_retries=3))
|
|
|
+ response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
|
|
|
+ response.close()
|
|
|
+ queryCount += 1
|
|
|
+ if response.status_code != 200:
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
|
|
|
+ return
|
|
|
+ elif 'data' not in response.text:
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
|
|
|
+ return
|
|
|
+ elif 'channelFeed' not in response.json()['data']:
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
|
|
|
+ return
|
|
|
+ elif 'Data' not in response.json()['data']['channelFeed']:
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
|
|
|
+ return
|
|
|
+ elif len(response.json()['data']['channelFeed']['Data']) == 0:
|
|
|
+ Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
|
|
|
+ return
|
|
|
+ else:
|
|
|
+ feeds = response.json()['data']['channelFeed']['Data']
|
|
|
+ for i in range(len(feeds)):
|
|
|
+ try:
|
|
|
+ item_id = feeds[i].get("data", {}).get("item_id", "")
|
|
|
+ if item_id == "":
|
|
|
+ Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
+ continue
|
|
|
+ video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
+ if video_dict is None:
|
|
|
+ Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
+ continue
|
|
|
+ for k, v in video_dict.items():
|
|
|
+ Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+ if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
|
|
|
+ Common.logger(log_type, crawler).info("不满足抓取规则\n")
|
|
|
+ elif any(str(word) if str(word) in video_dict["video_title"] else False
|
|
|
+ for word in get_config_from_mysql(log_type=log_type,
|
|
|
+ source=crawler,
|
|
|
+ env=env,
|
|
|
+ text="filter",
|
|
|
+ action="")) is True:
|
|
|
+ Common.logger(log_type, crawler).info('已中过滤词\n')
|
|
|
+ elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
|
|
|
+ Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
+ else:
|
|
|
+ cls.download_publish(log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ our_uid=our_uid,
|
|
|
+ video_dict=video_dict,
|
|
|
+ rule_dict=rule_dict,
|
|
|
+ env=env)
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"抓取第{queryCount}页时异常:{e}\n")
|
|
|
|
|
|
@classmethod
|
|
|
def download_publish(cls, log_type, crawler, our_uid, video_dict, rule_dict, env):
|