# -*- coding: utf-8 -*- # @Author: wangkun # @Time: 2023/2/3 """ YouTube 定向榜 1. 发布时间<=1个月 2. 10分钟>=时长>=1分钟 """ import os import re import shutil import sys import time import json # import emoji import requests from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.common.desired_capabilities import DesiredCapabilities sys.path.append(os.getcwd()) from common.common import Common from common.db import MysqlHelper from common.feishu import Feishu from common.users import Users from common.publish import Publish from common.translate import Translate class Follow: # 翻页参数 continuation = '' # 抓取平台 platform = 'youtube' @classmethod def get_browse_id(cls, log_type, crawler, out_user_id, machine): """ 获取每个用户的 browse_id :param log_type: 日志 :param crawler: 哪款爬虫 :param out_user_id: 站外用户 UID :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local :return: browse_id """ try: # 打印请求配置 ca = DesiredCapabilities.CHROME ca["goog:loggingPrefs"] = {"performance": "ALL"} # 不打开浏览器运行 chrome_options = webdriver.ChromeOptions() chrome_options.add_argument("--headless") chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36') chrome_options.add_argument("--no-sandbox") # driver初始化 if machine == 'aliyun' or machine == 'aliyun_hk': driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options) elif machine == 'macpro': driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver')) elif machine == 'macair': driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/piaoquan/Downloads/chromedriver')) else: driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver')) driver.implicitly_wait(10) url = f'https://www.youtube.com/{out_user_id}/videos' driver.get(url) # driver.save_screenshot("./1.png") # 向上滑动 1000 个像素 # driver.execute_script('window.scrollBy(0, 2000)') # driver.save_screenshot("./2.png") time.sleep(3) accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]') accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]') if len(accept_btns) != 0: accept_btns[0].click() time.sleep(2) elif len(accept_btns_eng) != 0: accept_btns_eng[0].click() time.sleep(2) browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content') driver.quit() return browse_id except Exception as e: Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n') @classmethod def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id): """ 获取站外用户信息 :param log_type: 日志 :param crawler: 哪款爬虫 :param browse_id: browse_id :param out_user_id: 站外用户 UID :return: out_user_dict = {'out_user_name': 站外用户昵称, 'out_avatar_url': 站外用户头像, 'out_fans': 站外用户粉丝量, 'out_play_cnt': 站外用户总播放量, 'out_create_time': 站外用户创建时间} """ try: url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false" payload = json.dumps({ "context": { "client": { "hl": "zh-CN", "gl": "US", "remoteHost": "38.93.247.21", "deviceMake": "Apple", "deviceModel": "", "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D", "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)", "clientName": "WEB", "clientVersion": "2.20230201.01.00", "osName": "Macintosh", "osVersion": "10_15_7", "originalUrl": f"https://www.youtube.com/{out_user_id}/about", "screenPixelDensity": 1, "platform": "DESKTOP", "clientFormFactor": "UNKNOWN_FORM_FACTOR", "configInfo": { "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D" }, "screenDensityFloat": 1, "timeZone": "Asia/Shanghai", "browserName": "Chrome", "browserVersion": "109.0.0.0", "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G", "screenWidthPoints": 805, "screenHeightPoints": 969, "utcOffsetMinutes": 480, "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT", "memoryTotalKbytes": "8000000", "mainAppWebInfo": { "graftUrl": f"/{out_user_id}/about", "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED", "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN", "isWebNativeShareAvailable": True } }, "user": { "lockedSafetyMode": False }, "request": { "useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": [] }, "clickTracking": { "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak=" }, "adSignalsInfo": { "params": [ { "key": "dt", "value": "1675668045032" }, { "key": "flash", "value": "0" }, { "key": "frm", "value": "0" }, { "key": "u_tz", "value": "480" }, { "key": "u_his", "value": "1" }, { "key": "u_h", "value": "1080" }, { "key": "u_w", "value": "1920" }, { "key": "u_ah", "value": "1080" }, { "key": "u_aw", "value": "1920" }, { "key": "u_cd", "value": "24" }, { "key": "bc", "value": "31" }, { "key": "bih", "value": "969" }, { "key": "biw", "value": "805" }, { "key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969" }, { "key": "vis", "value": "1" }, { "key": "wgl", "value": "true" }, { "key": "ca_type", "value": "image" } ], "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A" } }, "browseId": browse_id, "params": "EgVhYm91dPIGBAoCEgA%3D" }) headers = { 'authority': 'www.youtube.com', 'accept': '*/*', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'cache-control': 'no-cache', 'content-type': 'application/json', 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D', 'origin': 'https://www.youtube.com', 'pragma': 'no-cache', 'referer': f'https://www.youtube.com/{out_user_id}/videos', 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"', 'sec-ch-ua-arch': '"arm"', 'sec-ch-ua-bitness': '"64"', 'sec-ch-ua-full-version': '"109.0.1518.52"', 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-model': '', 'sec-ch-ua-platform': '"macOS"', 'sec-ch-ua-platform-version': '"12.4.0"', 'sec-ch-ua-wow64': '?0', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'same-origin', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D', 'x-youtube-bootstrap-logged-in': 'false', 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20230201.01.00' } response = requests.post(url=url, headers=headers, data=payload) if response.status_code != 200: Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n') elif 'contents' not in response.text or 'header' not in response.text: Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n') elif 'c4TabbedHeaderRenderer' not in response.json()['header']: Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n') elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']: Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n') elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']: Common.logger(log_type, crawler).warning(f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n") else: header = response.json()['header']['c4TabbedHeaderRenderer'] tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs'] for i in range(len(tabs)): if 'tabRenderer' not in tabs[i]: title = '' elif 'title' not in tabs[i]['tabRenderer']: title = '' else: title = tabs[i]['tabRenderer']['title'] if title == '简介': if 'tabRenderer' not in tabs[i]: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n") elif 'content' not in tabs[i]['tabRenderer']: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n") elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n") elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n") elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n") elif 'itemSectionRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n") elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n") elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) == 0: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n") elif 'channelAboutFullMetadataRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]: Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n") else: # 站外用户昵称 if 'title' not in header and 'title' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']: out_user_name = '' elif 'title' in header: out_user_name = header['title'] elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']: out_user_name = '' else: out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']['simpleText'] # 站外用户头像 if 'avatar' not in header and 'avatar' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']: out_avatar_url = '' elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']: out_avatar_url = '' elif len(header['avatar']['thumbnails']) == 0 and len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails']) == 0: out_avatar_url = '' elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]: out_avatar_url = '' elif 'url' in header['avatar']['thumbnails'][-1]: out_avatar_url = header['avatar']['thumbnails'][-1]['url'] else: out_avatar_url = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]['url'] # 站外用户粉丝 if 'subscriberCountText' not in header: out_fans = 0 elif 'accessibility' not in header['subscriberCountText']: out_fans = 0 elif 'accessibilityData' not in header['subscriberCountText']['accessibility']: out_fans = 0 elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']: out_fans = 0 else: out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label'] if '万' in out_fans: out_fans = int(float(out_fans.split('万')[0])*10000) elif "位" in out_fans: out_fans = int(out_fans.split('位')[0].replace(",", "")) else: pass # 站外用户总播放量 if 'viewCountText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']: out_play_cnt = 0 elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']: out_play_cnt = 0 else: out_play_cnt = int(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText'].split('次')[0].replace(',', '')) # 站外用户注册时间 if 'joinedDateText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']: out_create_time = '' elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']: out_create_time = '' elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs']) == 0: out_create_time = '' elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]: out_create_time = '' else: out_create_time = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月', '-').replace('日', '') out_user_dict = { 'out_user_name': out_user_name, 'out_avatar_url': out_avatar_url, 'out_fans': out_fans, 'out_play_cnt': out_play_cnt, 'out_create_time': out_create_time, } # print(out_user_dict) return out_user_dict except Exception as e: Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n') @classmethod def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine): """ 补全飞书用户表信息,并返回 :param log_type: 日志 :param crawler: 哪款爬虫 :param sheetid: 飞书表 :param env: 正式环境:prod,测试环境:dev :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local :return: user_list """ try: user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid) user_list = [] for i in range(1, len(user_sheet)): out_uid = user_sheet[i][2] user_name = user_sheet[i][3] browse_id = user_sheet[i][5] our_uid = user_sheet[i][6] Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n") # 获取站外browse_id,并写入飞书 if browse_id is None: browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine) if browse_id is None: Common.logger(log_type, crawler).warning('browse_id is None !') else: Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]]) Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}') # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号 if our_uid is None: sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """ our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine) # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库 if our_user_info is None or len(our_user_info) == 0: # 获取站外账号信息,写入数据库 out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid) out_avatar_url = out_user_dict['out_avatar_url'] out_create_time = out_user_dict['out_create_time'] out_play_cnt = out_user_dict['out_play_cnt'] out_fans = out_user_dict['out_fans'] tag = 'youtube爬虫,定向爬虫策略' # 创建站内账号 create_user_dict = { 'nickName': user_name, 'avatarUrl': out_avatar_url, 'tagName': tag, } our_uid = Users.create_user(log_type, crawler, create_user_dict, env) Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}') if env == 'dev': our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post' else: our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post' Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}') Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]]) Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!') sql = f""" insert into crawler_user(user_id, out_user_id, out_user_name, out_avatar_url, out_create_time, out_play_cnt, out_fans, platform, tag) values({our_uid}, "{out_uid}", "{user_name}", "{out_avatar_url}", "{out_create_time}", {out_play_cnt}, {out_fans}, "{cls.platform}", "{tag}") """ Common.logger(log_type, crawler).info(f'sql:{sql}') MysqlHelper.update_values(log_type, crawler, sql, env, machine) Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n') # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书 else: our_uid = our_user_info[0][1] if 'env' == 'prod': our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post' else: our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post' Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}') Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]]) Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n') user_dict = { 'out_user_id': out_uid, 'out_user_name': user_name, 'out_browse_id': browse_id, 'our_user_id': our_uid, } user_list.append(user_dict) return user_list except Exception as e: Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n") @classmethod def get_feeds(cls, log_type, crawler, browse_id, out_uid): """ 获取个人主页视频列表 :param log_type: 日志 :param crawler: 哪款爬虫 :param browse_id: 每个用户主页的请求参数中唯一值 :param out_uid: 站外用户UID :return: video_list """ url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false" payload = json.dumps({ "context": { "client": { "hl": "zh-CN", "gl": "US", "remoteHost": "38.93.247.21", "deviceMake": "Apple", "deviceModel": "", "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D", "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)", "clientName": "WEB", "clientVersion": "2.20230201.01.00", "osName": "Macintosh", "osVersion": "10_15_7", "originalUrl": f"https://www.youtube.com/{out_uid}/videos", "platform": "DESKTOP", "clientFormFactor": "UNKNOWN_FORM_FACTOR", "configInfo": { "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS" }, "timeZone": "Asia/Shanghai", "browserName": "Chrome", "browserVersion": "109.0.0.0", "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G", "screenWidthPoints": 944, "screenHeightPoints": 969, "screenPixelDensity": 1, "screenDensityFloat": 1, "utcOffsetMinutes": 480, "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT", "memoryTotalKbytes": "8000000", "mainAppWebInfo": { "graftUrl": f"/{out_uid}/videos", "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED", "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN", "isWebNativeShareAvailable": True } }, "user": { "lockedSafetyMode": False }, "request": { "useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": [] }, "clickTracking": { "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks=" }, "adSignalsInfo": { "params": [ { "key": "dt", "value": "1675676731048" }, { "key": "flash", "value": "0" }, { "key": "frm", "value": "0" }, { "key": "u_tz", "value": "480" }, { "key": "u_his", "value": "4" }, { "key": "u_h", "value": "1080" }, { "key": "u_w", "value": "1920" }, { "key": "u_ah", "value": "1080" }, { "key": "u_aw", "value": "1920" }, { "key": "u_cd", "value": "24" }, { "key": "bc", "value": "31" }, { "key": "bih", "value": "969" }, { "key": "biw", "value": "944" }, { "key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969" }, { "key": "vis", "value": "1" }, { "key": "wgl", "value": "true" }, { "key": "ca_type", "value": "image" } ], "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww" } }, "browseId": browse_id, "params": "EgZ2aWRlb3PyBgQKAjoA", "continuation": cls.continuation }) headers = { 'authority': 'www.youtube.com', 'accept': '*/*', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'cache-control': 'no-cache', 'content-type': 'application/json', 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D', 'origin': 'https://www.youtube.com', 'pragma': 'no-cache', 'referer': f'https://www.youtube.com/{out_uid}/featured', 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"', 'sec-ch-ua-arch': '"arm"', 'sec-ch-ua-bitness': '"64"', 'sec-ch-ua-full-version': '"109.0.1518.52"', 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-model': '', 'sec-ch-ua-platform': '"macOS"', 'sec-ch-ua-platform-version': '"12.4.0"', 'sec-ch-ua-wow64': '?0', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'same-origin', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D', 'x-youtube-bootstrap-logged-in': 'false', 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20230201.01.00' } try: response = requests.post(url=url, headers=headers, data=payload) # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n") cls.continuation = response.json()['trackingParams'] if response.status_code != 200: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n') elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n') elif 'continuationContents' in response.json(): # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n") if 'richGridContinuation' not in response.json()['continuationContents']: # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n") Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n') elif 'contents' not in response.json()['continuationContents']['richGridContinuation']: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n') elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]: feeds = response.json()["continuationContents"]["richGridContinuation"]['contents'] return feeds elif 'onResponseReceivedActions' in response.json(): Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n") if len(response.json()['onResponseReceivedActions']) == 0: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n') elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n') elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n') elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0: Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n') else: feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"] return feeds else: Common.logger(log_type, crawler).info('feeds is None\n') except Exception as e: Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n') @classmethod def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine): try: while True: feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid) for i in range(len(feeds)): if 'richItemRenderer' not in feeds[i]: Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]}\n') return elif 'content' not in feeds[i]['richItemRenderer']: Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]}\n') return elif 'videoRenderer' not in feeds[i]['richItemRenderer']['content']: Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]}\n') return elif 'videoId' not in feeds[i]["richItemRenderer"]["content"]['videoRenderer']: Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]["videoRenderer"]}\n') return else: video_id = feeds[i]["richItemRenderer"]["content"]['videoRenderer']['videoId'] video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine) # 发布时间<=30天 publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d"))) if int(time.time()) - publish_time <= 3600*24*180: cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine) else: Common.logger(log_type, crawler).info('发布时间超过180天\n') return except Exception as e: Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n") @classmethod def filter_emoji(cls, title): # 过滤表情 try: co = re.compile(u'[\U00010000-\U0010ffff]') except re.error: co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') return co.sub("", title) @classmethod def get_video_info(cls, log_type, crawler, out_uid, video_id, machine): try: url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false" payload = json.dumps({ "context": { "client": { "hl": "zh-CN", "gl": "US", "remoteHost": "38.93.247.21", "deviceMake": "Apple", "deviceModel": "", "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D", "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)", "clientName": "WEB", "clientVersion": "2.20230201.01.00", "osName": "Macintosh", "osVersion": "10_15_7", "originalUrl": f"https://www.youtube.com/watch?v={video_id}", "platform": "DESKTOP", "clientFormFactor": "UNKNOWN_FORM_FACTOR", "configInfo": { "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D" }, "timeZone": "Asia/Shanghai", "browserName": "Chrome", "browserVersion": "109.0.0.0", "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G", "screenWidthPoints": 1037, "screenHeightPoints": 969, "screenPixelDensity": 1, "screenDensityFloat": 1, "utcOffsetMinutes": 480, "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT", "memoryTotalKbytes": "8000000", "clientScreen": "WATCH", "mainAppWebInfo": { "graftUrl": f"/watch?v={video_id}", "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED", "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN", "isWebNativeShareAvailable": True } }, "user": { "lockedSafetyMode": False }, "request": { "useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": [] }, "clickTracking": { "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0=" }, "adSignalsInfo": { "params": [ { "key": "dt", "value": "1675749222611" }, { "key": "flash", "value": "0" }, { "key": "frm", "value": "0" }, { "key": "u_tz", "value": "480" }, { "key": "u_his", "value": "3" }, { "key": "u_h", "value": "1080" }, { "key": "u_w", "value": "1920" }, { "key": "u_ah", "value": "1080" }, { "key": "u_aw", "value": "1920" }, { "key": "u_cd", "value": "24" }, { "key": "bc", "value": "31" }, { "key": "bih", "value": "969" }, { "key": "biw", "value": "1037" }, { "key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969" }, { "key": "vis", "value": "1" }, { "key": "wgl", "value": "true" }, { "key": "ca_type", "value": "image" } ], "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA" } }, "videoId": str(video_id), "playbackContext": { "contentPlaybackContext": { "currentUrl": f"/watch?v={video_id}", "vis": 0, "splay": False, "autoCaptionsDefaultOn": False, "autonavState": "STATE_NONE", "html5Preference": "HTML5_PREF_WANTS", "signatureTimestamp": 19394, "referer": f"https://www.youtube.com/watch?v={video_id}", "lactMilliseconds": "-1", "watchAmbientModeContext": { "watchAmbientModeEnabled": True } } }, "racyCheckOk": False, "contentCheckOk": False }) headers = { 'authority': 'www.youtube.com', 'accept': '*/*', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'cache-control': 'no-cache', 'content-type': 'application/json', 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D', 'origin': 'https://www.youtube.com', 'pragma': 'no-cache', 'referer': f'https://www.youtube.com/watch?v={video_id}', 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"', 'sec-ch-ua-arch': '"arm"', 'sec-ch-ua-bitness': '"64"', 'sec-ch-ua-full-version': '"109.0.1518.52"', 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-model': '', 'sec-ch-ua-platform': '"macOS"', 'sec-ch-ua-platform-version': '"12.4.0"', 'sec-ch-ua-wow64': '?0', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'same-origin', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D', 'x-youtube-bootstrap-logged-in': 'false', 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20230201.01.00' } response = requests.post(url=url, headers=headers, data=payload) if response.status_code != 200: Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n") elif 'streamingData' not in response.json(): Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n") elif 'videoDetails' not in response.json(): Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n") elif 'microformat' not in response.json(): Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n") else: playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer'] videoDetails = response.json()['videoDetails'] # streamingData = response.json()['streamingData'] # video_title if 'title' not in videoDetails: video_title = '' else: video_title = videoDetails['title'].replace("&", "").strip().replace("\n", "") \ .replace("/", "").replace("\r", "").replace("#", "") \ .replace(".", "。").replace("\\", "").replace("&NBSP", "") video_title = cls.filter_emoji(video_title) # if Translate.is_contains_chinese(video_title) is False: video_title = Translate.google_translate(video_title, machine) # 自动翻译标题为中文 if 'lengthSeconds' not in videoDetails: duration = 0 else: duration = int(videoDetails['lengthSeconds']) # play_cnt if 'viewCount' not in videoDetails: play_cnt = 0 else: play_cnt = int(videoDetails['viewCount']) # publish_time if 'publishDate' not in playerMicroformatRenderer: publish_time = '' else: publish_time = playerMicroformatRenderer['publishDate'] if publish_time == '': publish_time_stamp = 0 elif ':' in publish_time: publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S"))) else: publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d"))) # user_name if 'author' not in videoDetails: user_name = '' else: user_name = videoDetails['author'] # cover_url if 'thumbnail' not in videoDetails: cover_url = '' elif 'thumbnails' not in videoDetails['thumbnail']: cover_url = '' elif len(videoDetails['thumbnail']['thumbnails']) == 0: cover_url = '' elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]: cover_url = '' else: cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url'] # video_url # if 'formats' not in streamingData: # video_url = '' # elif len(streamingData['formats']) == 0: # video_url = '' # elif 'url' not in streamingData['formats'][-1]: # video_url = '' # else: # video_url = streamingData['formats'][-1]['url'] video_url = f"https://www.youtube.com/watch?v={video_id}" Common.logger(log_type, crawler).info(f'video_title:{video_title}') Common.logger(log_type, crawler).info(f'video_id:{video_id}') Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}') Common.logger(log_type, crawler).info(f'publish_time:{publish_time}') Common.logger(log_type, crawler).info(f'user_name:{user_name}') Common.logger(log_type, crawler).info(f'cover_url:{cover_url}') Common.logger(log_type, crawler).info(f'video_url:{video_url}') video_dict = { 'video_title': video_title, 'video_id': video_id, 'duration': duration, 'play_cnt': play_cnt, 'publish_time': publish_time, 'publish_time_stamp': publish_time_stamp, 'user_name': user_name, 'out_uid': out_uid, 'cover_url': cover_url, 'video_url': video_url, } return video_dict except Exception as e: Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n") @classmethod def repeat_video(cls, log_type, crawler, video_id, env, machine): sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """ repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine) return len(repeat_video) @classmethod def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine): try: # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """ # repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine) if video_dict['video_title'] == '' or video_dict['video_url'] == '': Common.logger(log_type, crawler).info('无效视频\n') elif video_dict['duration'] > 1200 or video_dict['duration'] < 60: Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n") # elif repeat_video is not None and len(repeat_video) != 0: elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0: Common.logger(log_type, crawler).info('视频已下载\n') elif video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]: Common.logger(log_type, crawler).info('视频已下载\n') else: # 下载视频 Common.logger(log_type, crawler).info('开始下载视频...') # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url']) Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'], video_dict['video_url']) # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4") # video_width = int(ffmpeg_dict['width']) video_width = 1280 # video_height = int(ffmpeg_dict['height']) video_height = 720 duration = int(video_dict['duration']) # video_size = int(ffmpeg_dict['size']) Common.logger(log_type, crawler).info(f'video_width:{video_width}') Common.logger(log_type, crawler).info(f'video_height:{video_height}') Common.logger(log_type, crawler).info(f'duration:{duration}') # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n') video_dict['video_width'] = video_width video_dict['video_height'] = video_height video_dict['duration'] = duration video_dict['comment_cnt'] = 0 video_dict['like_cnt'] = 0 video_dict['share_cnt'] = 0 video_dict['avatar_url'] = video_dict['cover_url'] video_dict['session'] = f'youtube{int(time.time())}' rule='1,2' # if duration < 60 or duration > 600: # # 删除视频文件夹 # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/") # Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n") # return if duration == 0 or duration is None: # 删除视频文件夹 shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/") Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n") return else: # 下载封面 Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url']) # 保存视频文本信息 Common.save_video_info(log_type, crawler, video_dict) # 上传视频 Common.logger(log_type, crawler).info(f"开始上传视频") if env == 'dev': our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint) our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info" else: our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint) our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info" Common.logger(log_type, crawler).info("视频上传完成") # 视频信息保存至飞书 Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2) # 视频ID工作表,首行写入数据 upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) values = [[upload_time, "定向榜", video_dict['video_id'], video_dict['video_title'], our_video_link, video_dict['play_cnt'], video_dict['duration'], f'{video_width}*{video_height}', video_dict['publish_time'], video_dict['user_name'], video_dict['cover_url'], video_dict['video_url'] ]] time.sleep(1) Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values) Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n') # 视频信息保存数据库 sql = f""" insert into crawler_video(video_id, user_id, out_user_id, platform, strategy, out_video_id, video_title, cover_url, video_url, duration, publish_time, play_cnt, crawler_rule, width, height) values({our_video_id}, "{our_uid}", "{video_dict['out_uid']}", "{cls.platform}", "定向爬虫策略", "{video_dict['video_id']}", "{video_dict['video_title']}", "{video_dict['cover_url']}", "{video_dict['video_url']}", {int(duration)}, "{video_dict['publish_time']}", {int(video_dict['play_cnt'])}, "{rule}", {int(video_width)}, {int(video_height)}) """ MysqlHelper.update_values(log_type, crawler, sql, env, machine) Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n') except Exception as e: Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n") @classmethod def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine): try: user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine) if len(user_list) == 0: Common.logger(log_type, crawler).warning('用户列表为空\n') else: for user_dict in user_list: out_uid = user_dict['out_user_id'] user_name = user_dict['out_user_name'] browse_id = user_dict['out_browse_id'] our_uid = user_dict['our_user_id'] Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n') cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine) Common.logger(log_type, crawler).info('休眠 10 秒') time.sleep(10) cls.continuation = '' except Exception as e: Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n") if __name__ == "__main__": # print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local")) # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local')) # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel') # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI') # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local') # print(Follow.filter_emoji("姐妹倆一唱一和,完美配合,終於把大慶降服了😅😅#萌娃搞笑日常")) # Follow.repeat_video('follow', 'youtube', 4, "dev", "local") pass