youtube_follow.py 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import shutil
  11. import sys
  12. import time
  13. import json
  14. import requests
  15. from selenium import webdriver
  16. from selenium.webdriver.chrome.service import Service
  17. from selenium.webdriver.common.by import By
  18. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  19. sys.path.append(os.getcwd())
  20. from common.common import Common
  21. from common.db import MysqlHelper
  22. from common.feishu import Feishu
  23. from common.users import Users
  24. from common.publish import Publish
  25. from common.translate import Translate
  26. class Follow:
  27. # 翻页参数
  28. continuation = ''
  29. # 抓取平台
  30. platform = 'youtube'
  31. @classmethod
  32. def get_browse_id(cls, log_type, crawler, out_user_id, machine):
  33. """
  34. 获取每个用户的 browse_id
  35. :param log_type: 日志
  36. :param crawler: 哪款爬虫
  37. :param out_user_id: 站外用户 UID
  38. :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local
  39. :return: browse_id
  40. """
  41. try:
  42. # 打印请求配置
  43. ca = DesiredCapabilities.CHROME
  44. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  45. # 不打开浏览器运行
  46. chrome_options = webdriver.ChromeOptions()
  47. chrome_options.add_argument("--headless")
  48. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  49. chrome_options.add_argument("--no-sandbox")
  50. # driver初始化
  51. if machine == 'aliyun' or machine == 'aliyun_hk':
  52. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  53. elif machine == 'macpro':
  54. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  55. elif machine == 'macair':
  56. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/piaoquan/Downloads/chromedriver'))
  57. else:
  58. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  59. driver.implicitly_wait(10)
  60. url = f'https://www.youtube.com/{out_user_id}/videos'
  61. driver.get(url)
  62. # driver.save_screenshot("./1.png")
  63. # 向上滑动 1000 个像素
  64. # driver.execute_script('window.scrollBy(0, 2000)')
  65. # driver.save_screenshot("./2.png")
  66. time.sleep(3)
  67. accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
  68. accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
  69. if len(accept_btns) != 0:
  70. accept_btns[0].click()
  71. time.sleep(2)
  72. elif len(accept_btns_eng) != 0:
  73. accept_btns_eng[0].click()
  74. time.sleep(2)
  75. browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
  76. driver.quit()
  77. return browse_id
  78. except Exception as e:
  79. Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  80. @classmethod
  81. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  82. """
  83. 获取站外用户信息
  84. :param log_type: 日志
  85. :param crawler: 哪款爬虫
  86. :param browse_id: browse_id
  87. :param out_user_id: 站外用户 UID
  88. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  89. 'out_avatar_url': 站外用户头像,
  90. 'out_fans': 站外用户粉丝量,
  91. 'out_play_cnt': 站外用户总播放量,
  92. 'out_create_time': 站外用户创建时间}
  93. """
  94. try:
  95. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  96. payload = json.dumps({
  97. "context": {
  98. "client": {
  99. "hl": "zh-CN",
  100. "gl": "US",
  101. "remoteHost": "38.93.247.21",
  102. "deviceMake": "Apple",
  103. "deviceModel": "",
  104. "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D",
  105. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  106. "clientName": "WEB",
  107. "clientVersion": "2.20230201.01.00",
  108. "osName": "Macintosh",
  109. "osVersion": "10_15_7",
  110. "originalUrl": f"https://www.youtube.com/{out_user_id}/about",
  111. "screenPixelDensity": 1,
  112. "platform": "DESKTOP",
  113. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  114. "configInfo": {
  115. "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  116. },
  117. "screenDensityFloat": 1,
  118. "timeZone": "Asia/Shanghai",
  119. "browserName": "Chrome",
  120. "browserVersion": "109.0.0.0",
  121. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  122. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G",
  123. "screenWidthPoints": 805,
  124. "screenHeightPoints": 969,
  125. "utcOffsetMinutes": 480,
  126. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  127. "memoryTotalKbytes": "8000000",
  128. "mainAppWebInfo": {
  129. "graftUrl": f"/{out_user_id}/about",
  130. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  131. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  132. "isWebNativeShareAvailable": True
  133. }
  134. },
  135. "user": {
  136. "lockedSafetyMode": False
  137. },
  138. "request": {
  139. "useSsl": True,
  140. "internalExperimentFlags": [],
  141. "consistencyTokenJars": []
  142. },
  143. "clickTracking": {
  144. "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak="
  145. },
  146. "adSignalsInfo": {
  147. "params": [
  148. {
  149. "key": "dt",
  150. "value": "1675668045032"
  151. },
  152. {
  153. "key": "flash",
  154. "value": "0"
  155. },
  156. {
  157. "key": "frm",
  158. "value": "0"
  159. },
  160. {
  161. "key": "u_tz",
  162. "value": "480"
  163. },
  164. {
  165. "key": "u_his",
  166. "value": "1"
  167. },
  168. {
  169. "key": "u_h",
  170. "value": "1080"
  171. },
  172. {
  173. "key": "u_w",
  174. "value": "1920"
  175. },
  176. {
  177. "key": "u_ah",
  178. "value": "1080"
  179. },
  180. {
  181. "key": "u_aw",
  182. "value": "1920"
  183. },
  184. {
  185. "key": "u_cd",
  186. "value": "24"
  187. },
  188. {
  189. "key": "bc",
  190. "value": "31"
  191. },
  192. {
  193. "key": "bih",
  194. "value": "969"
  195. },
  196. {
  197. "key": "biw",
  198. "value": "805"
  199. },
  200. {
  201. "key": "brdim",
  202. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969"
  203. },
  204. {
  205. "key": "vis",
  206. "value": "1"
  207. },
  208. {
  209. "key": "wgl",
  210. "value": "true"
  211. },
  212. {
  213. "key": "ca_type",
  214. "value": "image"
  215. }
  216. ],
  217. "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A"
  218. }
  219. },
  220. "browseId": browse_id,
  221. "params": "EgVhYm91dPIGBAoCEgA%3D"
  222. })
  223. headers = {
  224. 'authority': 'www.youtube.com',
  225. 'accept': '*/*',
  226. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  227. 'cache-control': 'no-cache',
  228. 'content-type': 'application/json',
  229. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D',
  230. 'origin': 'https://www.youtube.com',
  231. 'pragma': 'no-cache',
  232. 'referer': f'https://www.youtube.com/{out_user_id}/videos',
  233. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  234. 'sec-ch-ua-arch': '"arm"',
  235. 'sec-ch-ua-bitness': '"64"',
  236. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  237. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  238. 'sec-ch-ua-mobile': '?0',
  239. 'sec-ch-ua-model': '',
  240. 'sec-ch-ua-platform': '"macOS"',
  241. 'sec-ch-ua-platform-version': '"12.4.0"',
  242. 'sec-ch-ua-wow64': '?0',
  243. 'sec-fetch-dest': 'empty',
  244. 'sec-fetch-mode': 'same-origin',
  245. 'sec-fetch-site': 'same-origin',
  246. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  247. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D',
  248. 'x-youtube-bootstrap-logged-in': 'false',
  249. 'x-youtube-client-name': '1',
  250. 'x-youtube-client-version': '2.20230201.01.00'
  251. }
  252. response = requests.post(url=url, headers=headers, data=payload)
  253. if response.status_code != 200:
  254. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  255. elif 'contents' not in response.text or 'header' not in response.text:
  256. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  257. elif 'c4TabbedHeaderRenderer' not in response.json()['header']:
  258. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n')
  259. elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']:
  260. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n')
  261. elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']:
  262. Common.logger(log_type, crawler).warning(f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n")
  263. else:
  264. header = response.json()['header']['c4TabbedHeaderRenderer']
  265. tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs']
  266. for i in range(len(tabs)):
  267. if 'tabRenderer' not in tabs[i]:
  268. title = ''
  269. elif 'title' not in tabs[i]['tabRenderer']:
  270. title = ''
  271. else:
  272. title = tabs[i]['tabRenderer']['title']
  273. if title == '简介':
  274. if 'tabRenderer' not in tabs[i]:
  275. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n")
  276. elif 'content' not in tabs[i]['tabRenderer']:
  277. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n")
  278. elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']:
  279. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n")
  280. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']:
  281. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  282. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0:
  283. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  284. elif 'itemSectionRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]:
  285. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n")
  286. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']:
  287. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  288. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) == 0:
  289. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  290. elif 'channelAboutFullMetadataRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]:
  291. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n")
  292. else:
  293. # 站外用户昵称
  294. if 'title' not in header and 'title' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  295. out_user_name = ''
  296. elif 'title' in header:
  297. out_user_name = header['title']
  298. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']:
  299. out_user_name = ''
  300. else:
  301. out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']['simpleText']
  302. # 站外用户头像
  303. if 'avatar' not in header and 'avatar' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  304. out_avatar_url = ''
  305. elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']:
  306. out_avatar_url = ''
  307. elif len(header['avatar']['thumbnails']) == 0 and len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails']) == 0:
  308. out_avatar_url = ''
  309. elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]:
  310. out_avatar_url = ''
  311. elif 'url' in header['avatar']['thumbnails'][-1]:
  312. out_avatar_url = header['avatar']['thumbnails'][-1]['url']
  313. else:
  314. out_avatar_url = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]['url']
  315. # 站外用户粉丝
  316. if 'subscriberCountText' not in header:
  317. out_fans = 0
  318. elif 'accessibility' not in header['subscriberCountText']:
  319. out_fans = 0
  320. elif 'accessibilityData' not in header['subscriberCountText']['accessibility']:
  321. out_fans = 0
  322. elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']:
  323. out_fans = 0
  324. else:
  325. out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
  326. if '万' in out_fans:
  327. out_fans = int(float(out_fans.split('万')[0])*10000)
  328. else:
  329. pass
  330. # 站外用户总播放量
  331. if 'viewCountText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  332. out_play_cnt = 0
  333. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']:
  334. out_play_cnt = 0
  335. else:
  336. out_play_cnt = int(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText'].split('次')[0].replace(',', ''))
  337. # 站外用户注册时间
  338. if 'joinedDateText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  339. out_create_time = ''
  340. elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']:
  341. out_create_time = ''
  342. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs']) == 0:
  343. out_create_time = ''
  344. elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]:
  345. out_create_time = ''
  346. else:
  347. out_create_time = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月', '-').replace('日', '')
  348. out_user_dict = {
  349. 'out_user_name': out_user_name,
  350. 'out_avatar_url': out_avatar_url,
  351. 'out_fans': out_fans,
  352. 'out_play_cnt': out_play_cnt,
  353. 'out_create_time': out_create_time,
  354. }
  355. # print(out_user_dict)
  356. return out_user_dict
  357. except Exception as e:
  358. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  359. @classmethod
  360. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  361. """
  362. 补全飞书用户表信息,并返回
  363. :param log_type: 日志
  364. :param crawler: 哪款爬虫
  365. :param sheetid: 飞书表
  366. :param env: 正式环境:prod,测试环境:dev
  367. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  368. :return: user_list
  369. """
  370. try:
  371. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  372. user_list = []
  373. for i in range(1, len(user_sheet)):
  374. out_uid = user_sheet[i][2]
  375. user_name = user_sheet[i][3]
  376. browse_id = user_sheet[i][5]
  377. our_uid = user_sheet[i][6]
  378. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  379. # 获取站外browse_id,并写入飞书
  380. if browse_id is None:
  381. browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
  382. if browse_id is None:
  383. Common.logger(log_type, crawler).warning('browse_id is None !')
  384. else:
  385. Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]])
  386. Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
  387. # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
  388. if our_uid is None:
  389. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  390. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  391. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  392. if our_user_info is None or len(our_user_info) == 0:
  393. # 获取站外账号信息,写入数据库
  394. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  395. out_avatar_url = out_user_dict['out_avatar_url']
  396. out_create_time = out_user_dict['out_create_time']
  397. out_play_cnt = out_user_dict['out_play_cnt']
  398. out_fans = out_user_dict['out_fans']
  399. tag = 'youtube爬虫,定向爬虫策略'
  400. # 创建站内账号
  401. create_user_dict = {
  402. 'nickName': user_name,
  403. 'avatarUrl': out_avatar_url,
  404. 'tagName': tag,
  405. }
  406. our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
  407. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  408. if env == 'prod':
  409. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  410. else:
  411. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  412. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  413. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
  414. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  415. sql = f""" insert into crawler_user(user_id,
  416. out_user_id,
  417. out_user_name,
  418. out_avatar_url,
  419. out_create_time,
  420. out_play_cnt,
  421. out_fans,
  422. platform,
  423. tag)
  424. values({our_uid},
  425. "{out_uid}",
  426. "{user_name}",
  427. "{out_avatar_url}",
  428. "{out_create_time}",
  429. {out_play_cnt},
  430. {out_fans},
  431. "{cls.platform}",
  432. "{tag}") """
  433. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  434. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  435. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  436. else:
  437. our_uid = our_user_info[0][1]
  438. if 'env' == 'prod':
  439. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  440. else:
  441. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  442. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  443. Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]])
  444. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  445. user_dict = {
  446. 'out_user_id': out_uid,
  447. 'out_user_name': user_name,
  448. 'out_browse_id': browse_id,
  449. 'our_user_id': our_uid,
  450. }
  451. user_list.append(user_dict)
  452. return user_list
  453. except Exception as e:
  454. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  455. @classmethod
  456. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  457. """
  458. 获取个人主页视频列表
  459. :param log_type: 日志
  460. :param crawler: 哪款爬虫
  461. :param browse_id: 每个用户主页的请求参数中唯一值
  462. :param out_uid: 站外用户UID
  463. :return: video_list
  464. """
  465. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  466. payload = json.dumps({
  467. "context": {
  468. "client": {
  469. "hl": "zh-CN",
  470. "gl": "US",
  471. "remoteHost": "38.93.247.21",
  472. "deviceMake": "Apple",
  473. "deviceModel": "",
  474. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  475. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  476. "clientName": "WEB",
  477. "clientVersion": "2.20230201.01.00",
  478. "osName": "Macintosh",
  479. "osVersion": "10_15_7",
  480. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  481. "platform": "DESKTOP",
  482. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  483. "configInfo": {
  484. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  485. },
  486. "timeZone": "Asia/Shanghai",
  487. "browserName": "Chrome",
  488. "browserVersion": "109.0.0.0",
  489. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  490. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  491. "screenWidthPoints": 944,
  492. "screenHeightPoints": 969,
  493. "screenPixelDensity": 1,
  494. "screenDensityFloat": 1,
  495. "utcOffsetMinutes": 480,
  496. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  497. "memoryTotalKbytes": "8000000",
  498. "mainAppWebInfo": {
  499. "graftUrl": f"/{out_uid}/videos",
  500. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  501. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  502. "isWebNativeShareAvailable": True
  503. }
  504. },
  505. "user": {
  506. "lockedSafetyMode": False
  507. },
  508. "request": {
  509. "useSsl": True,
  510. "internalExperimentFlags": [],
  511. "consistencyTokenJars": []
  512. },
  513. "clickTracking": {
  514. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  515. },
  516. "adSignalsInfo": {
  517. "params": [
  518. {
  519. "key": "dt",
  520. "value": "1675676731048"
  521. },
  522. {
  523. "key": "flash",
  524. "value": "0"
  525. },
  526. {
  527. "key": "frm",
  528. "value": "0"
  529. },
  530. {
  531. "key": "u_tz",
  532. "value": "480"
  533. },
  534. {
  535. "key": "u_his",
  536. "value": "4"
  537. },
  538. {
  539. "key": "u_h",
  540. "value": "1080"
  541. },
  542. {
  543. "key": "u_w",
  544. "value": "1920"
  545. },
  546. {
  547. "key": "u_ah",
  548. "value": "1080"
  549. },
  550. {
  551. "key": "u_aw",
  552. "value": "1920"
  553. },
  554. {
  555. "key": "u_cd",
  556. "value": "24"
  557. },
  558. {
  559. "key": "bc",
  560. "value": "31"
  561. },
  562. {
  563. "key": "bih",
  564. "value": "969"
  565. },
  566. {
  567. "key": "biw",
  568. "value": "944"
  569. },
  570. {
  571. "key": "brdim",
  572. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  573. },
  574. {
  575. "key": "vis",
  576. "value": "1"
  577. },
  578. {
  579. "key": "wgl",
  580. "value": "true"
  581. },
  582. {
  583. "key": "ca_type",
  584. "value": "image"
  585. }
  586. ],
  587. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  588. }
  589. },
  590. "browseId": browse_id,
  591. "params": "EgZ2aWRlb3PyBgQKAjoA",
  592. "continuation": cls.continuation
  593. })
  594. headers = {
  595. 'authority': 'www.youtube.com',
  596. 'accept': '*/*',
  597. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  598. 'cache-control': 'no-cache',
  599. 'content-type': 'application/json',
  600. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  601. 'origin': 'https://www.youtube.com',
  602. 'pragma': 'no-cache',
  603. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  604. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  605. 'sec-ch-ua-arch': '"arm"',
  606. 'sec-ch-ua-bitness': '"64"',
  607. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  608. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  609. 'sec-ch-ua-mobile': '?0',
  610. 'sec-ch-ua-model': '',
  611. 'sec-ch-ua-platform': '"macOS"',
  612. 'sec-ch-ua-platform-version': '"12.4.0"',
  613. 'sec-ch-ua-wow64': '?0',
  614. 'sec-fetch-dest': 'empty',
  615. 'sec-fetch-mode': 'same-origin',
  616. 'sec-fetch-site': 'same-origin',
  617. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  618. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  619. 'x-youtube-bootstrap-logged-in': 'false',
  620. 'x-youtube-client-name': '1',
  621. 'x-youtube-client-version': '2.20230201.01.00'
  622. }
  623. try:
  624. response = requests.post(url=url, headers=headers, data=payload)
  625. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  626. cls.continuation = response.json()['trackingParams']
  627. if response.status_code != 200:
  628. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  629. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  630. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  631. elif 'continuationContents' in response.json():
  632. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  633. if 'richGridContinuation' not in response.json()['continuationContents']:
  634. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  635. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n')
  636. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  637. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  638. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  639. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  640. return feeds
  641. elif 'onResponseReceivedActions' in response.json():
  642. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  643. if len(response.json()['onResponseReceivedActions']) == 0:
  644. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  645. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  646. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  647. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']:
  648. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  649. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0:
  650. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  651. else:
  652. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]
  653. return feeds
  654. else:
  655. Common.logger(log_type, crawler).info('feeds is None\n')
  656. except Exception as e:
  657. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  @classmethod
  def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine):
      """Page through one channel's feed and publish the videos of the last 30 days.

      Pages are fetched with ``cls.get_feeds`` until either no usable page is
      returned or a video older than 30 days is met; in both cases crawling
      of this channel stops.
      NOTE(review): stopping at the first old video assumes the feed is
      ordered newest first - confirm against ``get_feeds``.

      :param log_type: passed through to ``Common.logger`` and the helpers.
      :param crawler: passed through to ``Common.logger`` and the helpers.
      :param strategy: forwarded to ``cls.download_publish``.
      :param oss_endpoint: forwarded to ``cls.download_publish``.
      :param env: forwarded to ``cls.download_publish``.
      :param browse_id: channel browse id handed to ``cls.get_feeds``.
      :param out_uid: channel handle handed to ``cls.get_feeds`` / ``cls.get_video_info``.
      :param our_uid: forwarded to ``cls.download_publish``.
      :param machine: forwarded to ``cls.get_video_info`` / ``cls.download_publish``.
      :return: None. Any unexpected exception is logged, never raised.
      """
      max_age_seconds = 3600 * 24 * 30
      # Nested keys leading from a feed item to its video id.
      video_id_path = ('richItemRenderer', 'content', 'videoRenderer', 'videoId')
      try:
          while True:
              feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
              if not feeds:
                  # get_feeds yields None on every failure path (and may yield
                  # an empty page); without this guard the loop would either
                  # crash on len(None) or spin forever on an empty list.
                  Common.logger(log_type, crawler).info('no more feeds, stop paging\n')
                  return
              for feed in feeds:
                  node = feed
                  for key in video_id_path:
                      if key not in node:
                          # Not a plain video item: log the level at which the
                          # expected key is missing and move on.
                          Common.logger(log_type, crawler).warning(f'feeds:{node}\n')
                          break
                      node = node[key]
                  else:
                      video_id = node
                      video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
                      if not video_dict:
                          # get_video_info returns None when the player request
                          # fails; skip this video instead of aborting the channel.
                          Common.logger(log_type, crawler).warning(f'skip video_id:{video_id}, no video info\n')
                          continue
                      # Reuse the timestamp already computed by get_video_info:
                      # it copes with both date formats and with a missing date
                      # (0), whereas re-parsing with "%Y-%m-%d" raised ValueError.
                      publish_time_stamp = video_dict['publish_time_stamp']
                      if int(time.time()) - publish_time_stamp <= max_age_seconds:
                          # published within the last 30 days
                          cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
                      else:
                          Common.logger(log_type, crawler).info('发布时间超过30天\n')
                          return
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
      """Fetch the metadata of one video from the YouTube ``player`` endpoint.

      :param log_type: passed to ``Common.logger``.
      :param crawler: passed to ``Common.logger``.
      :param out_uid: channel identifier, copied unchanged into the result.
      :param video_id: id of the video to look up.
      :param machine: forwarded to ``Translate.google_translate``.
      :return: dict with the keys ``video_title``, ``video_id``, ``duration``,
          ``play_cnt``, ``publish_time``, ``publish_time_stamp``, ``user_name``,
          ``out_uid``, ``cover_url`` and ``video_url``; ``None`` when the
          request does not return 200, when the response lacks
          ``streamingData`` / ``videoDetails`` / ``microformat``, or when any
          exception occurs (it is logged, not raised).
      """
      try:
          url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
          watch_path = f"/watch?v={video_id}"
          watch_url = f"https://www.youtube.com/watch?v={video_id}"
          # Captured browser ad-signal parameters, kept in their original order.
          ad_signal_params = (
              ("dt", "1675749222611"),
              ("flash", "0"),
              ("frm", "0"),
              ("u_tz", "480"),
              ("u_his", "3"),
              ("u_h", "1080"),
              ("u_w", "1920"),
              ("u_ah", "1080"),
              ("u_aw", "1920"),
              ("u_cd", "24"),
              ("bc", "31"),
              ("bih", "969"),
              ("biw", "1037"),
              ("brdim", "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"),
              ("vis", "1"),
              ("wgl", "true"),
              ("ca_type", "image"),
          )
          payload = json.dumps({
              "context": {
                  "client": {
                      "hl": "zh-CN",
                      "gl": "US",
                      "remoteHost": "38.93.247.21",
                      "deviceMake": "Apple",
                      "deviceModel": "",
                      "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                      "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                      "clientName": "WEB",
                      "clientVersion": "2.20230201.01.00",
                      "osName": "Macintosh",
                      "osVersion": "10_15_7",
                      "originalUrl": watch_url,
                      "platform": "DESKTOP",
                      "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                      "configInfo": {
                          "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                      },
                      "timeZone": "Asia/Shanghai",
                      "browserName": "Chrome",
                      "browserVersion": "109.0.0.0",
                      "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                      "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                      "screenWidthPoints": 1037,
                      "screenHeightPoints": 969,
                      "screenPixelDensity": 1,
                      "screenDensityFloat": 1,
                      "utcOffsetMinutes": 480,
                      "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                      "memoryTotalKbytes": "8000000",
                      "clientScreen": "WATCH",
                      "mainAppWebInfo": {
                          "graftUrl": watch_path,
                          "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                          "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                          "isWebNativeShareAvailable": True
                      }
                  },
                  "user": {
                      "lockedSafetyMode": False
                  },
                  "request": {
                      "useSsl": True,
                      "internalExperimentFlags": [],
                      "consistencyTokenJars": []
                  },
                  "clickTracking": {
                      "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
                  },
                  "adSignalsInfo": {
                      "params": [{"key": key, "value": value} for key, value in ad_signal_params],
                      "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
                  }
              },
              "videoId": str(video_id),
              "playbackContext": {
                  "contentPlaybackContext": {
                      "currentUrl": watch_path,
                      "vis": 0,
                      "splay": False,
                      "autoCaptionsDefaultOn": False,
                      "autonavState": "STATE_NONE",
                      "html5Preference": "HTML5_PREF_WANTS",
                      "signatureTimestamp": 19394,
                      "referer": watch_url,
                      "lactMilliseconds": "-1",
                      "watchAmbientModeContext": {
                          "watchAmbientModeEnabled": True
                      }
                  }
              },
              "racyCheckOk": False,
              "contentCheckOk": False
          })
          headers = {
              'authority': 'www.youtube.com',
              'accept': '*/*',
              'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
              'cache-control': 'no-cache',
              'content-type': 'application/json',
              'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
              'origin': 'https://www.youtube.com',
              'pragma': 'no-cache',
              'referer': watch_url,
              'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
              'sec-ch-ua-arch': '"arm"',
              'sec-ch-ua-bitness': '"64"',
              'sec-ch-ua-full-version': '"109.0.1518.52"',
              'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-model': '',
              'sec-ch-ua-platform': '"macOS"',
              'sec-ch-ua-platform-version': '"12.4.0"',
              'sec-ch-ua-wow64': '?0',
              'sec-fetch-dest': 'empty',
              'sec-fetch-mode': 'same-origin',
              'sec-fetch-site': 'same-origin',
              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
              'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
              'x-youtube-bootstrap-logged-in': 'false',
              'x-youtube-client-name': '1',
              'x-youtube-client-version': '2.20230201.01.00'
          }
          response = requests.post(url=url, headers=headers, data=payload)
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
              return None

          # Decode the body once (and only after the status check, so a
          # non-JSON error page is still reported through the branch above).
          data = response.json()
          if any(section not in data for section in ('streamingData', 'videoDetails', 'microformat')):
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{data}\n")
              return None

          playerMicroformatRenderer = data['microformat']['playerMicroformatRenderer']
          videoDetails = data['videoDetails']
          streamingData = data['streamingData']

          # video_title; translated into Chinese automatically when it contains
          # no Chinese. An empty title is left alone: there is nothing to translate.
          video_title = videoDetails.get('title', '')
          if video_title != '' and Translate.is_contains_chinese(video_title) is False:
              video_title = Translate.google_translate(video_title, machine)

          # duration / play_cnt default to 0 when the field is absent
          duration = int(videoDetails['lengthSeconds']) if 'lengthSeconds' in videoDetails else 0
          play_cnt = int(videoDetails['viewCount']) if 'viewCount' in videoDetails else 0

          # publish_time and its epoch timestamp (0 when the date is absent)
          publish_time = playerMicroformatRenderer.get('publishDate', '')
          if publish_time == '':
              publish_time_stamp = 0
          elif ':' in publish_time:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
          else:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

          # user_name
          user_name = videoDetails.get('author', '')

          # cover_url: url of the last listed thumbnail, '' when unavailable
          thumbnails = videoDetails.get('thumbnail', {}).get('thumbnails', [])
          cover_url = thumbnails[-1].get('url', '') if len(thumbnails) != 0 else ''

          # video_url: url of the last listed format, '' when unavailable
          formats = streamingData.get('formats', [])
          video_url = formats[-1].get('url', '') if len(formats) != 0 else ''

          Common.logger(log_type, crawler).info(f'video_title:{video_title}')
          Common.logger(log_type, crawler).info(f'video_id:{video_id}')
          Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
          Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
          Common.logger(log_type, crawler).info(f'user_name:{user_name}')
          Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
          Common.logger(log_type, crawler).info(f'video_url:{video_url}')

          video_dict = {
              'video_title': video_title,
              'video_id': video_id,
              'duration': duration,
              'play_cnt': play_cnt,
              'publish_time': publish_time,
              'publish_time_stamp': publish_time_stamp,
              'user_name': user_name,
              'out_uid': out_uid,
              'cover_url': cover_url,
              'video_url': video_url,
          }
          return video_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
          return None
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
      """Download one video, upload it, and record it in Feishu and MySQL.

      The video is skipped when its title or url is empty, when its reported
      duration is outside 60..600 seconds, or when its id is already present
      in the ``crawler_video`` table or in the Feishu sheet ``GVxlYk``.
      After download the file is probed with ``Common.ffmpeg``; a zero
      size/duration or a measured duration outside 60..600 seconds removes
      the downloaded folder and stops.

      :param log_type: passed to ``Common.logger`` and the helpers.
      :param crawler: passed to ``Common.logger``; also the root of the local
          ``./{crawler}/videos/...`` download folder.
      :param video_dict: video metadata as built by ``get_video_info``; it is
          extended in place with width/height, counters, ``avatar_url`` and
          ``session`` before upload.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param our_uid: forwarded to ``Publish.upload_and_publish`` and stored
          as ``user_id``.
      :param env: ``'dev'`` selects the test admin host for the video link;
          also forwarded to the MySQL and publish helpers.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param machine: forwarded to the MySQL helper.
      :return: None. Any exception is logged, never raised.
      """
      try:
          # Cheap local rejections first, so no database / Feishu round trip
          # is spent on a video that is discarded anyway.
          if video_dict['video_title'] == '' or video_dict['video_url'] == '':
              Common.logger(log_type, crawler).info('无效视频\n')
              return
          if video_dict['duration'] > 600 or video_dict['duration'] < 60:
              Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
              return

          # NOTE(review): both SQL statements in this method interpolate
          # values obtained from YouTube straight into the statement text. A
          # title containing a double quote breaks the INSERT, and this is an
          # injection risk. The MySQL helper is called with a finished SQL
          # string here; switch to bound parameters if it supports them.
          sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
          repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
          if repeat_video is not None and len(repeat_video) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return
          if video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          # download the video
          video_dir = f"./{crawler}/videos/{video_dict['video_title']}/"
          Common.logger(log_type, crawler).info('开始下载视频...')
          Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
          ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}video.mp4")
          video_width = int(ffmpeg_dict['width'])
          video_height = int(ffmpeg_dict['height'])
          duration = int(ffmpeg_dict['duration'])
          video_size = int(ffmpeg_dict['size'])
          Common.logger(log_type, crawler).info(f'video_width:{video_width}')
          Common.logger(log_type, crawler).info(f'video_height:{video_height}')
          Common.logger(log_type, crawler).info(f'duration:{duration}')
          Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')

          video_dict['video_width'] = video_width
          video_dict['video_height'] = video_height
          video_dict['duration'] = duration
          video_dict['comment_cnt'] = 0
          video_dict['like_cnt'] = 0
          video_dict['share_cnt'] = 0
          video_dict['avatar_url'] = video_dict['cover_url']
          video_dict['session'] = f'youtube{int(time.time())}'
          rule = '1,2'

          # A zero size or duration means the download itself failed. It is
          # tested before the duration rule so it is reported as such (a zero
          # duration would otherwise always match the "< 60" rule first).
          if video_size == 0 or duration == 0:
              # delete the video folder
              shutil.rmtree(video_dir)
              Common.logger(log_type, crawler).info("视频下载出错,删除成功\n")
              return
          if duration < 60 or duration > 600:
              # delete the video folder
              shutil.rmtree(video_dir)
              Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
              return

          # download the cover
          Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
          # save the video's text info
          Common.save_video_info(log_type, crawler, video_dict)

          # upload the video; only the admin host of the link depends on env
          Common.logger(log_type, crawler).info("开始上传视频")
          our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"
          Common.logger(log_type, crawler).info("视频上传完成")

          # save the video info to Feishu: insert into sheet GVxlYk, then
          # write the record into row 2 (F2:Z2)
          Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
          upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
          values = [[upload_time,
                     "定向榜",
                     video_dict['video_id'],
                     video_dict['video_title'],
                     our_video_link,
                     video_dict['play_cnt'],
                     video_dict['duration'],
                     f'{video_width}*{video_height}',
                     video_dict['publish_time'],
                     video_dict['user_name'],
                     video_dict['cover_url'],
                     video_dict['video_url']
                     ]]
          time.sleep(1)
          Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
          Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

          # save the video info to the database
          sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          "{our_uid}",
                          "{video_dict['out_uid']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(duration)},
                          "{video_dict['publish_time']}",
                          {int(video_dict['play_cnt'])},
                          "{rule}",
                          {int(video_width)},
                          {int(video_height)}) """
          MysqlHelper.update_values(log_type, crawler, sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          # Logged at error level, like the other methods of this class.
          Common.logger(log_type, crawler).error(f"download_publish异常:{e}\n")
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the home-page videos of every followed user listed in Feishu.

      The user list is read from sheet ``c467d7`` through
      ``cls.get_user_from_feishu``. Each user is crawled with
      ``cls.get_videos``, followed by a 10 second pause and a reset of
      ``cls.continuation`` before the next user.

      :param log_type: passed to ``Common.logger`` and the helpers.
      :param crawler: passed to ``Common.logger`` and the helpers.
      :param strategy: forwarded to ``cls.get_videos``.
      :param oss_endpoint: forwarded to ``cls.get_videos``.
      :param env: forwarded to ``cls.get_user_from_feishu`` / ``cls.get_videos``.
      :param machine: forwarded to ``cls.get_user_from_feishu`` / ``cls.get_videos``.
      :return: None. Any exception is logged, never raised.
      """
      try:
          followed_users = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
          if len(followed_users) == 0:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return

          for followed in followed_users:
              out_uid = followed['out_user_id']
              channel_name = followed['out_user_name']
              browse_id = followed['out_browse_id']
              our_uid = followed['our_user_id']

              Common.logger(log_type, crawler).info(f'获取 {channel_name} 主页视频\n')
              cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)

              Common.logger(log_type, crawler).info('休眠 10 秒')
              time.sleep(10)
              # Clear the paging token so the next user starts from scratch.
              cls.continuation = ''
      except Exception as error:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{error}\n")
  if __name__ == "__main__":
      # Manual smoke test: resolve and print the browse id of one channel handle.
      print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))

      # Other entry points that can be enabled for manual debugging:
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')