Bron bekijken

update xigua_search

lierqiang 2 jaar geleden
bovenliggende
commit
d944cdfb10
1 gewijzigd bestand met 114 toevoegingen en 95 verwijderingen
  1. 114 95
      xigua/xigua_search/xigua_search.py

+ 114 - 95
xigua/xigua_search/xigua_search.py

@@ -101,10 +101,10 @@ class Search:
                 for i in range(1, len(user_sheet)):
                     our_uid = user_sheet[i][6]
                     search_word = user_sheet[i][4]
-                    storage = user_sheet[i][5]
                     tag1 = user_sheet[i][8]
                     tag2 = user_sheet[i][9]
                     tag3 = user_sheet[i][10]
+                    tag4 = user_sheet[i][11]
                     Common.logger(log_type, crawler).info(f"正在更新 {search_word} 关键词信息\n")
                     if our_uid is None:
                         default_user = Users.get_default_user()
@@ -112,7 +112,7 @@ class Search:
                         user_dict = {
                             'nickName': default_user['nickName'],
                             'avatarUrl': default_user['avatarUrl'],
-                            'tagName': f'{tag1},{tag2},{tag3}',
+                            'tagName': f'{tag1},{tag2},{tag3},{tag4}',
                         }
                         Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
                         our_uid = Users.create_uid(log_type, crawler, user_dict, env)
@@ -585,41 +585,37 @@ class Search:
 
     @classmethod
     def get_video_info(cls, log_type, crawler, item_id):
-        try:
-            d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
-                    "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
-                    "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
-                    "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
-                    "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
-                video_id=item_id)
-            res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
-            data = json.loads(res.text)['data']
-            item_counter = data['h5_extra']['itemCell']['itemCounter']
-            user_info = data['user_info']
-            detail_info = data['video_detail_info']
-            video_dict = {'video_title': data['title'],
-                          'video_id': detail_info['video_id'],
-                          'gid': data['group_id'],
-                          'play_cnt': item_counter['videoWatchCount'],
-                          'comment_cnt': item_counter['commentCount'],
-                          'like_cnt': item_counter['diggCount'],
-                          'share_cnt': item_counter['shareCount'],
-
-
-                          'duration': data['video_duration'],
-                          'publish_time_stamp': data['publish_time'],
-                          'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
-                                                            time.localtime(data['publish_time'])),
-                          'user_name': user_info['name'],
-                          'user_id': user_info['user_id'],
-                          'avatar_url': user_info['avatar_url'],
-                          'cover_url': data['large_image']['url'].replace('\u0026', '&'),
-
-                          }
-            return video_dict
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
-            return {}
+        d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
+                "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
+                "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
+                "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
+                "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
+            video_id=item_id)
+        res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
+        data = json.loads(res.text)['data']
+        item_counter = data['h5_extra']['itemCell']['itemCounter']
+        user_info = data['user_info']
+        detail_info = data['video_detail_info']
+        video_dict = {'video_title': data['title'],
+                      'video_id': detail_info['video_id'],
+                      'gid': data['group_id'],
+                      'play_cnt': item_counter['videoWatchCount'],
+                      'comment_cnt': item_counter['commentCount'],
+                      'like_cnt': item_counter['diggCount'],
+                      'share_cnt': item_counter['shareCount'],
+
+                      'duration': data['video_duration'],
+                      'publish_time_stamp': data['publish_time'],
+                      'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
+                                                        time.localtime(data['publish_time'])),
+                      'user_name': user_info['name'],
+                      'user_id': user_info['user_id'],
+                      'avatar_url': user_info['avatar_url'],
+                      'cover_url': data['large_image']['url'].replace('\u0026', '&'),
+
+                      }
+        return video_dict
+
     @classmethod
     def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
         old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
@@ -634,7 +630,7 @@ class Search:
 
     @classmethod
     def get_videolist(cls, log_type, crawler, strategy, our_uid, search_word, oss_endpoint, env, machine):
-        total_count = 0
+        total_count = 1
         offset = 0
         while True:
 
@@ -659,22 +655,21 @@ class Search:
             for video_info in search_list:
                 v_type = video_info['type']
                 rule_dict = cls.get_rule(log_type, crawler)
-                video_dict = {}
-
                 if v_type == 'video':
+                    item_id = video_info['data']['group_id']
+                    old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
+                    if video_info['data']['publish_time'] <= old_time:
+                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        continue
+                    elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
+                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        continue
+                    elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data'][
+                        'video_time'] > rule_dict['max_duration']:
+                        Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                        continue
                     try:
-                        item_id = video_info['data']['group_id']
-                        old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
-                        if video_info['data']['publish_time'] <= old_time:
-                            continue
-                        elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
-                            continue
-                        elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data']['video_time'] > rule_dict[
-                            'max_duration']:
-                            continue
                         video_dict = cls.get_video_info(log_type, crawler, item_id)
-                        if not video_dict:
-                            continue
                         video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
                         video_dict['video_width'] = video_url_dict["video_width"]
                         video_dict['video_height'] = video_url_dict["video_height"]
@@ -682,8 +677,36 @@ class Search:
                         video_dict['video_url'] = video_url_dict["video_url"]
                         video_dict['session'] = signature
                     except Exception as e:
-                        # Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
+                        Common.logger(log_type, crawler).error(f'视频:{item_id},获取详情失败,原因:{e}')
                         continue
+                    if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
+                        Common.logger(log_type, crawler).info(f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
+                        continue
+                    for k, v in video_dict.items():
+                        Common.logger(log_type, crawler).info(f"{k}:{v}")
+
+                    try:
+                        # print(
+                        #     f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
+                        cls.download_publish(
+                            search_word=search_word,
+                            log_type=log_type,
+                            crawler=crawler,
+                            video_dict=video_dict,
+                            rule_dict=rule_dict,
+                            strategy=strategy,
+                            our_uid=our_uid,
+                            oss_endpoint=oss_endpoint,
+                            env=env,
+                            machine=machine
+                        )
+
+                    except Exception as e:
+                        Common.logger(log_type, crawler).error(f'视频:{item_id},下载失败,原因:{e}')
+                        continue
+                    total_count += 1
+                    if total_count >= 30:
+                        return
                 elif v_type == 'pseries':
                     try:
                         item_id = video_info['data']['group_id']
@@ -697,57 +720,53 @@ class Search:
                         p_res = requests.request("GET", p_url, headers=p_headers,
                                                  proxies=Common.tunnel_proxies()).json()
                     except Exception as e:
-                        # Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
+                        Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
                         continue
                     for video in p_res['data']:
                         item_id = video['item_id']
                         try:
                             video_dict = cls.get_video_info(log_type, crawler, item_id)
-                            if not video_dict:
-                                continue
-                            if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
-                                Common.logger(log_type, crawler).info(f'gid:{item_id},不符合抓取规则\n')
-                                continue
-                            else:
-                                video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
-                                video_dict['video_width'] = video_url_dict["video_width"]
-                                video_dict['video_height'] = video_url_dict["video_height"]
-                                video_dict['audio_url'] = video_url_dict["audio_url"]
-                                video_dict['video_url'] = video_url_dict["video_url"]
-                                video_dict['session'] = signature
-                                break
+                            video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
+                            video_dict['video_width'] = video_url_dict["video_width"]
+                            video_dict['video_height'] = video_url_dict["video_height"]
+                            video_dict['audio_url'] = video_url_dict["audio_url"]
+                            video_dict['video_url'] = video_url_dict["video_url"]
+                            video_dict['session'] = signature
                         except Exception as e:
                             Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                             continue
-                if not video_dict:
-                    continue
-                if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
-                    Common.logger(log_type, crawler).info(f'gid:{video_dict["video_id"]},视频已下载,无需重复下载\n')
-                    continue
-                for k, v in video_dict.items():
-                    Common.logger(log_type, crawler).info(f"{k}:{v}")
-                try:
-
-                    cls.download_publish(
-                        search_word=search_word,
-                        log_type=log_type,
-                        crawler=crawler,
-                        video_dict=video_dict,
-                        rule_dict=rule_dict,
-                        strategy=strategy,
-                        our_uid=our_uid,
-                        oss_endpoint=oss_endpoint,
-                        env=env,
-                        machine=machine
-                    )
-                except Exception as e:
-                    Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
-                    continue
+                        if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
+                            Common.logger(log_type, crawler).info(
+                                f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
+                            continue
+                        if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
+                            Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
+                            continue
+                        for k, v in video_dict.items():
+                            Common.logger(log_type, crawler).info(f"{k}:{v}")
+                        try:
+                            # print(
+                            #     f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
+                            cls.download_publish(
+                                search_word=search_word,
+                                log_type=log_type,
+                                crawler=crawler,
+                                video_dict=video_dict,
+                                rule_dict=rule_dict,
+                                strategy=strategy,
+                                our_uid=our_uid,
+                                oss_endpoint=oss_endpoint,
+                                env=env,
+                                machine=machine
+                            )
+                            total_count += 1
+                            if total_count >= 30:
+                                return
+                            else:
+                                break
+                        except Exception as e:
+                            Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
 
-                total_count += 1
-                # print(f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
-                if total_count >= 30:
-                    return
             offset += 10
 
     @classmethod
@@ -887,7 +906,7 @@ class Search:
 
 if __name__ == '__main__':
     # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
-    # Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
+    Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
 
     # Follow.get_videolist(log_type="follow",
     #                      crawler="xigua",