瀏覽代碼

update youtube 抓取日期修改,获取站外信息失败修复

kk 2 年之前
父節點
當前提交
4eb434d1cf
共有 1 個文件被更改,包括 45 次插入和 328 次删除
  1. 45 328
      youtube/youtube_follow/youtube_follow_api.py

+ 45 - 328
youtube/youtube_follow/youtube_follow_api.py

@@ -33,6 +33,21 @@ headers = {
 }
 
 
+def format_nums(data):
+    """Convert a human-readable count such as '1.5万' or '2.3K' to an int.
+
+    The first number found in ``data`` is used. Thousands separators are
+    ignored, and a unit suffix that directly follows the number (optionally
+    separated by whitespace) scales it.
+
+    :param data: text or number holding the count, e.g. '1,234', '12.3万', 56
+    :return: the count as an int, truncated toward zero
+    :raises ValueError: if ``data`` contains no digits
+    """
+    # Function-scope import so this block does not depend on the module's
+    # import section.
+    from decimal import Decimal
+
+    multipliers = {
+        '亿': 100000000, '百万': 1000000, '万': 10000, '千': 1000,
+        'k': 1000, 'K': 1000,
+        'w': 10000, 'W': 10000,
+        'm': 1000000, 'M': 1000000,
+    }
+    # The unit must sit right after the number; a unit letter that merely
+    # appears somewhere later in the text (the 'w' in 'followers') is ignored.
+    matched = re.search(
+        r'(\d[\d,]*(?:\.\d+)?|\.\d+)\s*(亿|百万|万|千|[kKwWmM])?', str(data))
+    if matched is None:
+        raise ValueError(f'format_nums: no number found in {data!r}')
+    number, unit = matched.groups()
+    # Decimal arithmetic is exact for decimal input, so the product is not
+    # truncated one below the intended value by binary float rounding.
+    return int(Decimal(number.replace(',', '')) * multipliers.get(unit, 1))
+
+
 class Follow:
     # 翻页参数
     continuation = ''
@@ -114,331 +129,33 @@ class Follow:
                                 'out_create_time': 站外用户创建时间}
         """
         try:
-            url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
-            payload = json.dumps({
-                "context": {
-                    "client": {
-                        "hl": "zh-CN",
-                        "gl": "US",
-                        "remoteHost": "38.93.247.21",
-                        "deviceMake": "Apple",
-                        "deviceModel": "",
-                        "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D",
-                        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
-                        "clientName": "WEB",
-                        "clientVersion": "2.20230201.01.00",
-                        "osName": "Macintosh",
-                        "osVersion": "10_15_7",
-                        "originalUrl": f"https://www.youtube.com/{out_user_id}/about",
-                        "screenPixelDensity": 1,
-                        "platform": "DESKTOP",
-                        "clientFormFactor": "UNKNOWN_FORM_FACTOR",
-                        "configInfo": {
-                            "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
-                        },
-                        "screenDensityFloat": 1,
-                        "timeZone": "Asia/Shanghai",
-                        "browserName": "Chrome",
-                        "browserVersion": "109.0.0.0",
-                        "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-                        "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G",
-                        "screenWidthPoints": 805,
-                        "screenHeightPoints": 969,
-                        "utcOffsetMinutes": 480,
-                        "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
-                        "memoryTotalKbytes": "8000000",
-                        "mainAppWebInfo": {
-                            "graftUrl": f"/{out_user_id}/about",
-                            "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
-                            "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
-                            "isWebNativeShareAvailable": True
-                        }
-                    },
-                    "user": {
-                        "lockedSafetyMode": False
-                    },
-                    "request": {
-                        "useSsl": True,
-                        "internalExperimentFlags": [],
-                        "consistencyTokenJars": []
-                    },
-                    "clickTracking": {
-                        "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak="
-                    },
-                    "adSignalsInfo": {
-                        "params": [
-                            {
-                                "key": "dt",
-                                "value": "1675668045032"
-                            },
-                            {
-                                "key": "flash",
-                                "value": "0"
-                            },
-                            {
-                                "key": "frm",
-                                "value": "0"
-                            },
-                            {
-                                "key": "u_tz",
-                                "value": "480"
-                            },
-                            {
-                                "key": "u_his",
-                                "value": "1"
-                            },
-                            {
-                                "key": "u_h",
-                                "value": "1080"
-                            },
-                            {
-                                "key": "u_w",
-                                "value": "1920"
-                            },
-                            {
-                                "key": "u_ah",
-                                "value": "1080"
-                            },
-                            {
-                                "key": "u_aw",
-                                "value": "1920"
-                            },
-                            {
-                                "key": "u_cd",
-                                "value": "24"
-                            },
-                            {
-                                "key": "bc",
-                                "value": "31"
-                            },
-                            {
-                                "key": "bih",
-                                "value": "969"
-                            },
-                            {
-                                "key": "biw",
-                                "value": "805"
-                            },
-                            {
-                                "key": "brdim",
-                                "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969"
-                            },
-                            {
-                                "key": "vis",
-                                "value": "1"
-                            },
-                            {
-                                "key": "wgl",
-                                "value": "true"
-                            },
-                            {
-                                "key": "ca_type",
-                                "value": "image"
-                            }
-                        ],
-                        "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A"
-                    }
-                },
-                # "browseId": browse_id,
-                "params": "EgVhYm91dPIGBAoCEgA%3D"
-            })
-            headers = {
-                'authority': 'www.youtube.com',
-                'accept': '*/*',
-                'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-                'cache-control': 'no-cache',
-                'content-type': 'application/json',
-                'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D',
-                'origin': 'https://www.youtube.com',
-                'pragma': 'no-cache',
-                'referer': f'https://www.youtube.com/{out_user_id}/videos',
-                'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
-                'sec-ch-ua-arch': '"arm"',
-                'sec-ch-ua-bitness': '"64"',
-                'sec-ch-ua-full-version': '"109.0.1518.52"',
-                'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
-                'sec-ch-ua-mobile': '?0',
-                'sec-ch-ua-model': '',
-                'sec-ch-ua-platform': '"macOS"',
-                'sec-ch-ua-platform-version': '"12.4.0"',
-                'sec-ch-ua-wow64': '?0',
-                'sec-fetch-dest': 'empty',
-                'sec-fetch-mode': 'same-origin',
-                'sec-fetch-site': 'same-origin',
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
-                'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D',
-                'x-youtube-bootstrap-logged-in': 'false',
-                'x-youtube-client-name': '1',
-                'x-youtube-client-version': '2.20230201.01.00'
+            url = f'https://www.youtube.com/{out_user_id}/about'
+            res = requests.get(url=url, headers=headers)
+            info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
+            data = json.loads(info)
+            header = data['header']['c4TabbedHeaderRenderer']
+            tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
+            subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
+            for tab in tabs:
+                if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
+                    continue
+                viewCountText = \
+                tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
+                    'contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText']
+                out_create_time = \
+                tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
+                    'contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][1]['text']
+                break
+            out_user_dict = {
+                'out_user_name': header['title'],
+                'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
+                'out_fans': format_nums(subsimpleText),
+                'out_play_cnt': int(
+                    viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
+                'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
             }
-            response = requests.post(url=url, headers=headers, data=payload)
-            if response.status_code != 200:
-                Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
-            elif 'contents' not in response.text or 'header' not in response.text:
-                Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
-            elif 'c4TabbedHeaderRenderer' not in response.json()['header']:
-                Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n')
-            elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']:
-                Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n')
-            elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']:
-                Common.logger(log_type, crawler).warning(
-                    f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n")
-            else:
-                header = response.json()['header']['c4TabbedHeaderRenderer']
-                tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs']
-                for i in range(len(tabs)):
-                    if 'tabRenderer' not in tabs[i]:
-                        title = ''
-                    elif 'title' not in tabs[i]['tabRenderer']:
-                        title = ''
-                    else:
-                        title = tabs[i]['tabRenderer']['title']
-
-                    if title == '简介':
-                        if 'tabRenderer' not in tabs[i]:
-                            Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n")
-                        elif 'content' not in tabs[i]['tabRenderer']:
-                            Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n")
-                        elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n")
-                        elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
-                        elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
-                        elif 'itemSectionRenderer' not in \
-                                tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n")
-                        elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                            'itemSectionRenderer']:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
-                        elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                     'itemSectionRenderer']['contents']) == 0:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
-                        elif 'channelAboutFullMetadataRenderer' not in \
-                                tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                    'itemSectionRenderer']['contents'][0]:
-                            Common.logger(log_type, crawler).warning(
-                                f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n")
-                        else:
-                            # 站外用户昵称
-                            if 'title' not in header and 'title' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
-                                out_user_name = ''
-                            elif 'title' in header:
-                                out_user_name = header['title']
-                            elif 'simpleText' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'title']:
-                                out_user_name = ''
-                            else:
-                                out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                    'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title'][
-                                    'simpleText']
-
-                            # 站外用户头像
-                            if 'avatar' not in header and 'avatar' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
-                                out_avatar_url = ''
-                            elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'avatar']:
-                                out_avatar_url = ''
-                            elif len(header['avatar']['thumbnails']) == 0 and len(
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'avatar']['thumbnails']) == 0:
-                                out_avatar_url = ''
-                            elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'avatar']['thumbnails'][-1]:
-                                out_avatar_url = ''
-                            elif 'url' in header['avatar']['thumbnails'][-1]:
-                                out_avatar_url = header['avatar']['thumbnails'][-1]['url']
-                            else:
-                                out_avatar_url = \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'avatar'][
-                                        'thumbnails'][-1]['url']
-
-                            # 站外用户粉丝
-                            if 'subscriberCountText' not in header:
-                                out_fans = 0
-                            elif 'accessibility' not in header['subscriberCountText']:
-                                out_fans = 0
-                            elif 'accessibilityData' not in header['subscriberCountText']['accessibility']:
-                                out_fans = 0
-                            elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']:
-                                out_fans = 0
-                            else:
-                                out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
-                                if '万' in out_fans:
-                                    out_fans = int(float(out_fans.split('万')[0]) * 10000)
-                                elif "位" in out_fans:
-                                    out_fans = int(out_fans.split('位')[0].replace(",", ""))
-                                else:
-                                    pass
-
-                            # 站外用户总播放量
-                            if 'viewCountText' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
-                                out_play_cnt = 0
-                            elif 'simpleText' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'viewCountText']:
-                                out_play_cnt = 0
-                            else:
-                                out_play_cnt = int(
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'viewCountText']['simpleText'].split('次')[0].replace(',', ''))
-
-                            # 站外用户注册时间
-                            if 'joinedDateText' not in \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
-                                out_create_time = ''
-                            elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                'joinedDateText']:
-                                out_create_time = ''
-                            elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                         'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                         'joinedDateText']['runs']) == 0:
-                                out_create_time = ''
-                            elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                'joinedDateText']['runs'][0]:
-                                out_create_time = ''
-                            else:
-                                out_create_time = \
-                                    tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
-                                        'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
-                                        'joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月',
-                                                                                                        '-').replace(
-                                        '日', '')
-                            out_user_dict = {
-                                'out_user_name': out_user_name,
-                                'out_avatar_url': out_avatar_url,
-                                'out_fans': out_fans,
-                                'out_play_cnt': out_play_cnt,
-                                'out_create_time': out_create_time,
-                            }
-                            # print(out_user_dict)
-                            return out_user_dict
+            # print(out_user_dict)
+            return out_user_dict
         except Exception as e:
             Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
 
@@ -857,9 +574,9 @@ class Follow:
                     video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
                     video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
                     # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
-                    # 发布时间<=30
+                    # 发布时间<=7
                     publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                    if int(time.time()) - publish_time <= 3600 * 24 * 30:
+                    if int(time.time()) - publish_time <= 3600 * 24 * 7:
                         cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
                                              machine)
                     else:
@@ -881,9 +598,9 @@ class Follow:
                 if 'richItemRenderer' in data:
                     video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
                     video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
-                    # 发布时间<=30
+                    # 发布时间<=7
                     publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                    if int(time.time()) - publish_time <= 3600 * 24 * 30:
+                    if int(time.time()) - publish_time <= 3600 * 24 * 7:
                         cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
                                              machine)
                     else: