|
@@ -662,14 +662,14 @@ class XiguaSearch:
|
|
|
if v_type == 'video':
|
|
|
item_id = video_info['data']['group_id']
|
|
|
if video_info['data']['publish_time'] <= old_time:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
|
|
|
continue
|
|
|
elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
|
|
|
continue
|
|
|
elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data'][
|
|
|
'video_time'] > rule_dict['max_duration']:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
|
|
|
continue
|
|
|
try:
|
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
@@ -680,10 +680,10 @@ class XiguaSearch:
|
|
|
video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
video_dict['session'] = signature
|
|
|
except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},获取详情失败,原因:{e}')
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},获取详情失败,原因:{e}')
|
|
|
continue
|
|
|
if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
- Common.logger(log_type, crawler).info(f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
+ Common.logger(log_type, crawler).info(f'关键词:{search_word},gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
continue
|
|
|
for k, v in video_dict.items():
|
|
|
Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
@@ -705,70 +705,70 @@ class XiguaSearch:
|
|
|
)
|
|
|
|
|
|
except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},下载失败,原因:{e}')
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},下载失败,原因:{e}')
|
|
|
continue
|
|
|
total_count += 1
|
|
|
if total_count >= 30:
|
|
|
return
|
|
|
- elif v_type == 'pseries':
|
|
|
- try:
|
|
|
- item_id = video_info['data']['group_id']
|
|
|
- p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
|
|
|
- item_id)
|
|
|
- p_headers = {
|
|
|
- 'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
|
|
|
- item_id),
|
|
|
- 'user-agent': get_random_user_agent('pc'),
|
|
|
- }
|
|
|
- p_res = requests.request("GET", p_url, headers=p_headers,
|
|
|
- proxies=Common.tunnel_proxies()).json()
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
|
|
|
- continue
|
|
|
- for video in p_res['data']:
|
|
|
- item_id = video['item_id']
|
|
|
- try:
|
|
|
- video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
- video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
- video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
- video_dict['video_height'] = video_url_dict["video_height"]
|
|
|
- video_dict['audio_url'] = video_url_dict["audio_url"]
|
|
|
- video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
- video_dict['session'] = signature
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
- continue
|
|
|
- if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
- continue
|
|
|
- if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
- continue
|
|
|
- for k, v in video_dict.items():
|
|
|
- Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- try:
|
|
|
- # print(
|
|
|
- # f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
- cls.download_publish(
|
|
|
- search_word=search_word,
|
|
|
- log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- video_dict=video_dict,
|
|
|
- rule_dict=rule_dict,
|
|
|
- strategy=strategy,
|
|
|
- our_uid=our_uid,
|
|
|
- oss_endpoint=oss_endpoint,
|
|
|
- env=env,
|
|
|
- machine=machine
|
|
|
- )
|
|
|
- total_count += 1
|
|
|
- if total_count >= 30:
|
|
|
- return
|
|
|
- else:
|
|
|
- break
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
|
|
|
+ # elif v_type == 'pseries':
|
|
|
+ # try:
|
|
|
+ # item_id = video_info['data']['group_id']
|
|
|
+ # p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
|
|
|
+ # item_id)
|
|
|
+ # p_headers = {
|
|
|
+ # 'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
|
|
|
+ # item_id),
|
|
|
+ # 'user-agent': get_random_user_agent('pc'),
|
|
|
+ # }
|
|
|
+ # p_res = requests.request("GET", p_url, headers=p_headers,
|
|
|
+ # proxies=Common.tunnel_proxies()).json()
|
|
|
+ # except Exception as e:
|
|
|
+ # Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
|
|
|
+ # continue
|
|
|
+ # for video in p_res['data']:
|
|
|
+ # item_id = video['item_id']
|
|
|
+ # try:
|
|
|
+ # video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
+ # video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
+ # video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
+ # video_dict['video_height'] = video_url_dict["video_height"]
|
|
|
+ # video_dict['audio_url'] = video_url_dict["audio_url"]
|
|
|
+ # video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
+ # video_dict['session'] = signature
|
|
|
+ # except Exception as e:
|
|
|
+ # Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
+ # continue
|
|
|
+ # if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
+ # Common.logger(log_type, crawler).info(
|
|
|
+ # f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
+ # continue
|
|
|
+ # if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
+ # Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ # continue
|
|
|
+ # for k, v in video_dict.items():
|
|
|
+ # Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+ # try:
|
|
|
+ # # print(
|
|
|
+ # # f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
+ # cls.download_publish(
|
|
|
+ # search_word=search_word,
|
|
|
+ # log_type=log_type,
|
|
|
+ # crawler=crawler,
|
|
|
+ # video_dict=video_dict,
|
|
|
+ # rule_dict=rule_dict,
|
|
|
+ # strategy=strategy,
|
|
|
+ # our_uid=our_uid,
|
|
|
+ # oss_endpoint=oss_endpoint,
|
|
|
+ # env=env,
|
|
|
+ # machine=machine
|
|
|
+ # )
|
|
|
+ # total_count += 1
|
|
|
+ # if total_count >= 30:
|
|
|
+ # return
|
|
|
+ # else:
|
|
|
+ # break
|
|
|
+ # except Exception as e:
|
|
|
+ # Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
|
|
|
|
|
|
offset += 10
|
|
|
|