
update xigua_search: adjust filtering logic

lierqiang, 2 years ago
Parent
Commit
9507b60cc6
1 changed file with 70 additions and 53 deletions
      xigua/xigua_search/xigua_search.py
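
The diff below adjusts the search filtering: candidate videos are now screened by publish time, play count, and duration before the detail request is made, get_video_info takes log_type and crawler and returns None when the detail fetch fails, and the already-downloaded check moves out of download_publish into the crawl loop. A minimal standalone sketch of the pre-filter added in the second hunk, assuming the rule_dict keys used there (publish_time in days, play_cnt as a minimum play count, min_duration/max_duration in seconds) and a hypothetical passes_rules helper:

    import time

    def passes_rules(video_data, rule_dict):
        # Hypothetical helper mirroring the checks added in the crawl loop;
        # video_data stands in for video_info['data'] from the search response.
        oldest_allowed = int(time.time()) - 3600 * 24 * rule_dict['publish_time']
        if video_data['publish_time'] <= oldest_allowed:
            return False  # published outside the allowed window
        if video_data['video_watch_count'] <= rule_dict['play_cnt']:
            return False  # not enough plays
        if not (rule_dict['min_duration'] <= video_data['video_time'] <= rule_dict['max_duration']):
            return False  # too short or too long
        return True

Only items that pass these checks go on to cls.get_video_info and the download step.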

xigua/xigua_search/xigua_search.py  +70 -53

@@ -582,38 +582,41 @@ class Search:
             Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
 
     @classmethod
-    def get_video_info(cls, item_id):
-        d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
-                "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
-                "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
-                "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
-                "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
-            video_id=item_id)
-        res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
-        data = json.loads(res.text)['data']
-        item_counter = data['h5_extra']['itemCell']['itemCounter']
-        user_info = data['user_info']
-        detail_info = data['video_detail_info']
-        video_dict = {'video_title': data['title'],
-                      'video_id': detail_info['video_id'],
-                      'gid': data['group_id'],
-                      'play_cnt': item_counter['videoWatchCount'],
-                      'comment_cnt': item_counter['commentCount'],
-                      'like_cnt': item_counter['diggCount'],
-                      'share_cnt': item_counter['shareCount'],
-
-                      'duration': data['video_duration'],
-                      'publish_time_stamp': data['publish_time'],
-                      'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
-                                                        time.localtime(data['publish_time'])),
-                      'user_name': user_info['name'],
-                      'user_id': user_info['user_id'],
-                      'avatar_url': user_info['avatar_url'],
-                      'cover_url': data['large_image']['url'].replace('\u0026', '&'),
-
-                      }
-        return video_dict
-
+    def get_video_info(cls, log_type, crawler, item_id):
+        try:
+            d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
+                    "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
+                    "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
+                    "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
+                    "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
+                video_id=item_id)
+            res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
+            data = json.loads(res.text)['data']
+            item_counter = data['h5_extra']['itemCell']['itemCounter']
+            user_info = data['user_info']
+            detail_info = data['video_detail_info']
+            video_dict = {'video_title': data['title'],
+                          'video_id': detail_info['video_id'],
+                          'gid': data['group_id'],
+                          'play_cnt': item_counter['videoWatchCount'],
+                          'comment_cnt': item_counter['commentCount'],
+                          'like_cnt': item_counter['diggCount'],
+                          'share_cnt': item_counter['shareCount'],
+
+                          'duration': data['video_duration'],
+                          'publish_time_stamp': data['publish_time'],
+                          'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
+                                                            time.localtime(data['publish_time'])),
+                          'user_name': user_info['name'],
+                          'user_id': user_info['user_id'],
+                          'avatar_url': user_info['avatar_url'],
+                          'cover_url': data['large_image']['url'].replace('\u0026', '&'),
+
+                          }
+            return video_dict
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
+            return
     @classmethod
     def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
         old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
@@ -657,7 +660,17 @@ class Search:
 
                 if v_type == 'video':
                     try:
-                        video_dict = cls.get_video_info(item_id)
+                        old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
+                        if video_info['data']['publish_time'] <= old_time:
+                            continue
+                        elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
+                            continue
+                        elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data']['video_time'] > rule_dict[
+                            'max_duration']:
+                            continue
+                        video_dict = cls.get_video_info(log_type, crawler, item_id)
+                        if not video_dict:
+                            continue
                         video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
                         video_dict['video_width'] = video_url_dict["video_width"]
                         video_dict['video_height'] = video_url_dict["video_height"]
@@ -684,7 +697,9 @@ class Search:
                     for video in p_res['data']:
                         item_id = video['item_id']
                         try:
-                            video_dict = cls.get_video_info(item_id)
+                            video_dict = cls.get_video_info(log_type, crawler, item_id)
+                            if not video_dict:
+                                continue
                             if cls.is_ruled(log_type, crawler, video_dict, rule_dict):
                                 video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
                                 video_dict['video_width'] = video_url_dict["video_width"]
@@ -704,19 +719,24 @@ class Search:
                 for k, v in video_dict.items():
                     Common.logger(log_type, crawler).info(f"{k}:{v}")
                 # print(f'title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
-                cls.download_publish(
-                    search_word=search_word,
-                    log_type=log_type,
-                    crawler=crawler,
-                    video_dict=video_dict,
-                    rule_dict=rule_dict,
-                    strategy=strategy,
-                    our_uid=our_uid,
-                    oss_endpoint=oss_endpoint,
-                    env=env,
-                    machine=machine
-                )
-
+                try:
+                    if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
+                        Common.logger(log_type, crawler).info(f'gid:{item_id},视频已下载,无需重复下载\n')
+                        continue
+                    cls.download_publish(
+                        search_word=search_word,
+                        log_type=log_type,
+                        crawler=crawler,
+                        video_dict=video_dict,
+                        rule_dict=rule_dict,
+                        strategy=strategy,
+                        our_uid=our_uid,
+                        oss_endpoint=oss_endpoint,
+                        env=env,
+                        machine=machine
+                    )
+                except Exception as e:
+                    continue
                 total_count += 1
                 if total_count >= 30:
                     return
@@ -733,10 +753,7 @@ class Search:
     def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                          env, machine):
         try:
-            if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
-                Common.logger(log_type, crawler).info('视频已下载\n')
-                return
-            Common.logger(log_type, crawler).info('视频已下载\n')
+
             Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                    title=video_dict['video_title'], url=video_dict['video_url'])
             # 下载音频
@@ -848,7 +865,7 @@ class Search:
             user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="SSPNPW", env=env,
                                           machine=machine)
             for user in user_list:
-                search_word = user["search_word"]
+                search_word = '猪油 健康'#user["search_word"]
                 our_uid = user["our_uid"]
                 Common.logger(log_type, crawler).info(f"开始抓取 {search_word} 用户主页视频\n")
                 cls.get_videolist(log_type=log_type,
@@ -865,7 +882,7 @@ class Search:
 
 if __name__ == '__main__':
     # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
-    Search.get_search_videos('search', 'xigua', 'xigua_search', 'out', 'dev', 'local')
+    Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
 
     # Follow.get_videolist(log_type="follow",
     #                      crawler="xigua",
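
Two behavioural changes in the hunks above: get_video_info now logs and returns None on any request or parsing failure, so callers guard with "if not video_dict: continue", and the repeat_video check has moved from download_publish into the crawl loop, so download_publish is only reached for new ids. A standalone sketch of that loop shape, with stand-ins for get_video_info and repeat_video:

    def crawl_once(item_ids, fetch_info, already_downloaded):
        # fetch_info stands in for cls.get_video_info and may return None on failure;
        # already_downloaded stands in for the repeat_video lookup (truthy == skip).
        published = []
        for item_id in item_ids:
            video_dict = fetch_info(item_id)
            if not video_dict:
                continue  # detail fetch failed, move on to the next result
            if already_downloaded(video_dict['video_id']):
                continue  # duplicate, skip before download/publish
            published.append(video_dict['video_id'])  # stand-in for cls.download_publish(...)
        return published

    # Example: the second item fails to fetch and the first is a known duplicate.
    infos = {'a': {'video_id': 'v1'}, 'b': None, 'c': {'video_id': 'v3'}}
    print(crawl_once(['a', 'b', 'c'], infos.get, {'v1'}.__contains__))  # ['v3']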