
update 西瓜搜索 (Xigua search): stop crawling 合集 (collection/pseries) results

lierqiang committed 2 years ago · commit d9ba59d116
1 changed file with 65 additions and 65 deletions

xigua/xigua_search/xigua_search.py (+65 -65)

@@ -662,14 +662,14 @@ class XiguaSearch:
                 if v_type == 'video':
                     item_id = video_info['data']['group_id']
                     if video_info['data']['publish_time'] <= old_time:
-                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
                         continue
                     elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
-                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
                         continue
                     elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data'][
                         'video_time'] > rule_dict['max_duration']:
-                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
                         continue
                     try:
                         video_dict = cls.get_video_info(log_type, crawler, item_id)
@@ -680,10 +680,10 @@ class XiguaSearch:
                         video_dict['video_url'] = video_url_dict["video_url"]
                         video_dict['session'] = signature
                     except Exception as e:
-                        Common.logger(log_type, crawler).error(f'视频:{item_id},获取详情失败,原因:{e}')
+                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},获取详情失败,原因:{e}')
                         continue
                     if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
-                        Common.logger(log_type, crawler).info(f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
+                        Common.logger(log_type, crawler).info(f'关键词:{search_word},gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
@@ -705,70 +705,70 @@ class XiguaSearch:
                         )
 
                     except Exception as e:
-                        Common.logger(log_type, crawler).error(f'视频:{item_id},下载失败,原因:{e}')
+                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},下载失败,原因:{e}')
                         continue
                     total_count += 1
                     if total_count >= 30:
                         return
-                elif v_type == 'pseries':
-                    try:
-                        item_id = video_info['data']['group_id']
-                        p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
-                            item_id)
-                        p_headers = {
-                            'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
-                                item_id),
-                            'user-agent': get_random_user_agent('pc'),
-                        }
-                        p_res = requests.request("GET", p_url, headers=p_headers,
-                                                 proxies=Common.tunnel_proxies()).json()
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
-                        continue
-                    for video in p_res['data']:
-                        item_id = video['item_id']
-                        try:
-                            video_dict = cls.get_video_info(log_type, crawler, item_id)
-                            video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
-                            video_dict['video_width'] = video_url_dict["video_width"]
-                            video_dict['video_height'] = video_url_dict["video_height"]
-                            video_dict['audio_url'] = video_url_dict["audio_url"]
-                            video_dict['video_url'] = video_url_dict["video_url"]
-                            video_dict['session'] = signature
-                        except Exception as e:
-                            Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
-                            continue
-                        if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
-                            Common.logger(log_type, crawler).info(
-                                f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
-                            continue
-                        if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
-                            Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
-                            continue
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        try:
-                            # print(
-                            #     f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
-                            cls.download_publish(
-                                search_word=search_word,
-                                log_type=log_type,
-                                crawler=crawler,
-                                video_dict=video_dict,
-                                rule_dict=rule_dict,
-                                strategy=strategy,
-                                our_uid=our_uid,
-                                oss_endpoint=oss_endpoint,
-                                env=env,
-                                machine=machine
-                            )
-                            total_count += 1
-                            if total_count >= 30:
-                                return
-                            else:
-                                break
-                        except Exception as e:
-                            Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
+                # elif v_type == 'pseries':
+                #     try:
+                #         item_id = video_info['data']['group_id']
+                #         p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
+                #             item_id)
+                #         p_headers = {
+                #             'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
+                #                 item_id),
+                #             'user-agent': get_random_user_agent('pc'),
+                #         }
+                #         p_res = requests.request("GET", p_url, headers=p_headers,
+                #                                  proxies=Common.tunnel_proxies()).json()
+                #     except Exception as e:
+                #         Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
+                #         continue
+                #     for video in p_res['data']:
+                #         item_id = video['item_id']
+                #         try:
+                #             video_dict = cls.get_video_info(log_type, crawler, item_id)
+                #             video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
+                #             video_dict['video_width'] = video_url_dict["video_width"]
+                #             video_dict['video_height'] = video_url_dict["video_height"]
+                #             video_dict['audio_url'] = video_url_dict["audio_url"]
+                #             video_dict['video_url'] = video_url_dict["video_url"]
+                #             video_dict['session'] = signature
+                #         except Exception as e:
+                #             Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
+                #             continue
+                #         if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
+                #             Common.logger(log_type, crawler).info(
+                #                 f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
+                #             continue
+                #         if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
+                #             Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                #             continue
+                #         for k, v in video_dict.items():
+                #             Common.logger(log_type, crawler).info(f"{k}:{v}")
+                #         try:
+                #             # print(
+                #             #     f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
+                #             cls.download_publish(
+                #                 search_word=search_word,
+                #                 log_type=log_type,
+                #                 crawler=crawler,
+                #                 video_dict=video_dict,
+                #                 rule_dict=rule_dict,
+                #                 strategy=strategy,
+                #                 our_uid=our_uid,
+                #                 oss_endpoint=oss_endpoint,
+                #                 env=env,
+                #                 machine=machine
+                #             )
+                #             total_count += 1
+                #             if total_count >= 30:
+                #                 return
+                #             else:
+                #                 break
+                #         except Exception as e:
+                #             Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
 
             offset += 10
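
For context, here is a minimal sketch of the per-keyword rule check that the retained 'video' branch performs after this commit, with 关键词:{search_word} now prefixed to every rejection log. It assumes only the rule_dict keys ('play_cnt', 'min_duration', 'max_duration') and video_info['data'] fields visible in the hunk above; the helper name passes_rules and the sample values are hypothetical, and print() stands in for Common.logger to keep the snippet self-contained.

# Sketch only, not the project's code: mirrors the filter order in the patched
# 'video' branch (publish time, play count, duration), tagging each rejection
# with the search keyword as the commit does.
def passes_rules(search_word, video_info, rule_dict, old_time):
    data = video_info['data']
    item_id = data['group_id']
    prefix = f'关键词:{search_word},视频:{item_id},'
    if data['publish_time'] <= old_time:
        print(prefix + '不符合抓取规则(发布时间过旧)')
        return False
    if data['video_watch_count'] <= rule_dict['play_cnt']:
        print(prefix + '不符合抓取规则(播放量不足)')
        return False
    if not (rule_dict['min_duration'] <= data['video_time'] <= rule_dict['max_duration']):
        print(prefix + '不符合抓取规则(时长不符)')
        return False
    return True


if __name__ == '__main__':
    # Hypothetical rule thresholds and video metadata, for illustration only.
    rule = {'play_cnt': 1000, 'min_duration': 60, 'max_duration': 600}
    info = {'data': {'group_id': 123, 'publish_time': 1700000000,
                     'video_watch_count': 5000, 'video_time': 120}}
    print(passes_rules('养生', info, rule, old_time=1600000000))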