# youtube_follow_api.py (82 KB)
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. import random
  16. # import emoji
  17. import requests
  18. from selenium import webdriver
  19. from selenium.webdriver.chrome.service import Service
  20. from selenium.webdriver.common.by import By
  21. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  22. sys.path.append(os.getcwd())
  23. from common.common import Common
  24. from common.db import MysqlHelper
  25. from common.feishu import Feishu
  26. from common.users import Users
  27. from common.publish import Publish
  28. from common.translate import Translate
# Module-level request headers: a desktop Chrome (macOS) User-Agent string.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
class Follow:
    """Helpers for crawling targeted ("follow") YouTube channels."""

    # Pagination token; sent as the "continuation" field of the get_feeds request payload
    continuation = ''
    # Platform being crawled; also used as the crawler_user.platform value in SQL
    platform = 'youtube'
    # Request headers carrying a desktop Chrome (macOS) User-Agent string
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    }
  40. @classmethod
  41. def get_browse_id(cls, log_type, crawler, out_user_id, machine):
  42. """
  43. 获取每个用户的 browse_id
  44. :param log_type: 日志
  45. :param crawler: 哪款爬虫
  46. :param out_user_id: 站外用户 UID
  47. :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local
  48. :return: browse_id
  49. """
  50. try:
  51. # 打印请求配置
  52. ca = DesiredCapabilities.CHROME
  53. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  54. # 不打开浏览器运行
  55. chrome_options = webdriver.ChromeOptions()
  56. chrome_options.add_argument("--headless")
  57. chrome_options.add_argument(
  58. '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  59. chrome_options.add_argument("--no-sandbox")
  60. # driver初始化
  61. if machine == 'aliyun' or machine == 'aliyun_hk':
  62. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  63. elif machine == 'macpro':
  64. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  65. service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  66. elif machine == 'macair':
  67. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  68. service=Service('/Users/piaoquan/Downloads/chromedriver'))
  69. else:
  70. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
  71. '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  72. driver.implicitly_wait(10)
  73. url = f'https://www.youtube.com/{out_user_id}/videos'
  74. driver.get(url)
  75. # driver.save_screenshot("./1.png")
  76. # 向上滑动 1000 个像素
  77. # driver.execute_script('window.scrollBy(0, 2000)')
  78. # driver.save_screenshot("./2.png")
  79. time.sleep(3)
  80. accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
  81. accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
  82. if len(accept_btns) != 0:
  83. accept_btns[0].click()
  84. time.sleep(2)
  85. elif len(accept_btns_eng) != 0:
  86. accept_btns_eng[0].click()
  87. time.sleep(2)
  88. browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
  89. driver.quit()
  90. return browse_id
  91. except Exception as e:
  92. Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  93. @classmethod
  94. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  95. """
  96. 获取站外用户信息
  97. :param log_type: 日志
  98. :param crawler: 哪款爬虫
  99. :param browse_id: browse_id
  100. :param out_user_id: 站外用户 UID
  101. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  102. 'out_avatar_url': 站外用户头像,
  103. 'out_fans': 站外用户粉丝量,
  104. 'out_play_cnt': 站外用户总播放量,
  105. 'out_create_time': 站外用户创建时间}
  106. """
  107. try:
  108. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  109. payload = json.dumps({
  110. "context": {
  111. "client": {
  112. "hl": "zh-CN",
  113. "gl": "US",
  114. "remoteHost": "38.93.247.21",
  115. "deviceMake": "Apple",
  116. "deviceModel": "",
  117. "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D",
  118. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  119. "clientName": "WEB",
  120. "clientVersion": "2.20230201.01.00",
  121. "osName": "Macintosh",
  122. "osVersion": "10_15_7",
  123. "originalUrl": f"https://www.youtube.com/{out_user_id}/about",
  124. "screenPixelDensity": 1,
  125. "platform": "DESKTOP",
  126. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  127. "configInfo": {
  128. "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  129. },
  130. "screenDensityFloat": 1,
  131. "timeZone": "Asia/Shanghai",
  132. "browserName": "Chrome",
  133. "browserVersion": "109.0.0.0",
  134. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  135. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G",
  136. "screenWidthPoints": 805,
  137. "screenHeightPoints": 969,
  138. "utcOffsetMinutes": 480,
  139. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  140. "memoryTotalKbytes": "8000000",
  141. "mainAppWebInfo": {
  142. "graftUrl": f"/{out_user_id}/about",
  143. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  144. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  145. "isWebNativeShareAvailable": True
  146. }
  147. },
  148. "user": {
  149. "lockedSafetyMode": False
  150. },
  151. "request": {
  152. "useSsl": True,
  153. "internalExperimentFlags": [],
  154. "consistencyTokenJars": []
  155. },
  156. "clickTracking": {
  157. "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak="
  158. },
  159. "adSignalsInfo": {
  160. "params": [
  161. {
  162. "key": "dt",
  163. "value": "1675668045032"
  164. },
  165. {
  166. "key": "flash",
  167. "value": "0"
  168. },
  169. {
  170. "key": "frm",
  171. "value": "0"
  172. },
  173. {
  174. "key": "u_tz",
  175. "value": "480"
  176. },
  177. {
  178. "key": "u_his",
  179. "value": "1"
  180. },
  181. {
  182. "key": "u_h",
  183. "value": "1080"
  184. },
  185. {
  186. "key": "u_w",
  187. "value": "1920"
  188. },
  189. {
  190. "key": "u_ah",
  191. "value": "1080"
  192. },
  193. {
  194. "key": "u_aw",
  195. "value": "1920"
  196. },
  197. {
  198. "key": "u_cd",
  199. "value": "24"
  200. },
  201. {
  202. "key": "bc",
  203. "value": "31"
  204. },
  205. {
  206. "key": "bih",
  207. "value": "969"
  208. },
  209. {
  210. "key": "biw",
  211. "value": "805"
  212. },
  213. {
  214. "key": "brdim",
  215. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969"
  216. },
  217. {
  218. "key": "vis",
  219. "value": "1"
  220. },
  221. {
  222. "key": "wgl",
  223. "value": "true"
  224. },
  225. {
  226. "key": "ca_type",
  227. "value": "image"
  228. }
  229. ],
  230. "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A"
  231. }
  232. },
  233. # "browseId": browse_id,
  234. "params": "EgVhYm91dPIGBAoCEgA%3D"
  235. })
  236. headers = {
  237. 'authority': 'www.youtube.com',
  238. 'accept': '*/*',
  239. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  240. 'cache-control': 'no-cache',
  241. 'content-type': 'application/json',
  242. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D',
  243. 'origin': 'https://www.youtube.com',
  244. 'pragma': 'no-cache',
  245. 'referer': f'https://www.youtube.com/{out_user_id}/videos',
  246. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  247. 'sec-ch-ua-arch': '"arm"',
  248. 'sec-ch-ua-bitness': '"64"',
  249. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  250. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  251. 'sec-ch-ua-mobile': '?0',
  252. 'sec-ch-ua-model': '',
  253. 'sec-ch-ua-platform': '"macOS"',
  254. 'sec-ch-ua-platform-version': '"12.4.0"',
  255. 'sec-ch-ua-wow64': '?0',
  256. 'sec-fetch-dest': 'empty',
  257. 'sec-fetch-mode': 'same-origin',
  258. 'sec-fetch-site': 'same-origin',
  259. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  260. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D',
  261. 'x-youtube-bootstrap-logged-in': 'false',
  262. 'x-youtube-client-name': '1',
  263. 'x-youtube-client-version': '2.20230201.01.00'
  264. }
  265. response = requests.post(url=url, headers=headers, data=payload)
  266. if response.status_code != 200:
  267. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  268. elif 'contents' not in response.text or 'header' not in response.text:
  269. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  270. elif 'c4TabbedHeaderRenderer' not in response.json()['header']:
  271. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n')
  272. elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']:
  273. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n')
  274. elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']:
  275. Common.logger(log_type, crawler).warning(
  276. f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n")
  277. else:
  278. header = response.json()['header']['c4TabbedHeaderRenderer']
  279. tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs']
  280. for i in range(len(tabs)):
  281. if 'tabRenderer' not in tabs[i]:
  282. title = ''
  283. elif 'title' not in tabs[i]['tabRenderer']:
  284. title = ''
  285. else:
  286. title = tabs[i]['tabRenderer']['title']
  287. if title == '简介':
  288. if 'tabRenderer' not in tabs[i]:
  289. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n")
  290. elif 'content' not in tabs[i]['tabRenderer']:
  291. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n")
  292. elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']:
  293. Common.logger(log_type, crawler).warning(
  294. f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n")
  295. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']:
  296. Common.logger(log_type, crawler).warning(
  297. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  298. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0:
  299. Common.logger(log_type, crawler).warning(
  300. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  301. elif 'itemSectionRenderer' not in \
  302. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]:
  303. Common.logger(log_type, crawler).warning(
  304. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n")
  305. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  306. 'itemSectionRenderer']:
  307. Common.logger(log_type, crawler).warning(
  308. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  309. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  310. 'itemSectionRenderer']['contents']) == 0:
  311. Common.logger(log_type, crawler).warning(
  312. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  313. elif 'channelAboutFullMetadataRenderer' not in \
  314. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  315. 'itemSectionRenderer']['contents'][0]:
  316. Common.logger(log_type, crawler).warning(
  317. f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n")
  318. else:
  319. # 站外用户昵称
  320. if 'title' not in header and 'title' not in \
  321. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  322. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  323. out_user_name = ''
  324. elif 'title' in header:
  325. out_user_name = header['title']
  326. elif 'simpleText' not in \
  327. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  328. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  329. 'title']:
  330. out_user_name = ''
  331. else:
  332. out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  333. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title'][
  334. 'simpleText']
  335. # 站外用户头像
  336. if 'avatar' not in header and 'avatar' not in \
  337. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  338. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  339. out_avatar_url = ''
  340. elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in \
  341. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  342. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  343. 'avatar']:
  344. out_avatar_url = ''
  345. elif len(header['avatar']['thumbnails']) == 0 and len(
  346. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  347. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  348. 'avatar']['thumbnails']) == 0:
  349. out_avatar_url = ''
  350. elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in \
  351. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  352. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  353. 'avatar']['thumbnails'][-1]:
  354. out_avatar_url = ''
  355. elif 'url' in header['avatar']['thumbnails'][-1]:
  356. out_avatar_url = header['avatar']['thumbnails'][-1]['url']
  357. else:
  358. out_avatar_url = \
  359. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  360. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  361. 'avatar'][
  362. 'thumbnails'][-1]['url']
  363. # 站外用户粉丝
  364. if 'subscriberCountText' not in header:
  365. out_fans = 0
  366. elif 'accessibility' not in header['subscriberCountText']:
  367. out_fans = 0
  368. elif 'accessibilityData' not in header['subscriberCountText']['accessibility']:
  369. out_fans = 0
  370. elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']:
  371. out_fans = 0
  372. else:
  373. out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
  374. if '万' in out_fans:
  375. out_fans = int(float(out_fans.split('万')[0]) * 10000)
  376. elif "位" in out_fans:
  377. out_fans = int(out_fans.split('位')[0].replace(",", ""))
  378. else:
  379. pass
  380. # 站外用户总播放量
  381. if 'viewCountText' not in \
  382. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  383. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  384. out_play_cnt = 0
  385. elif 'simpleText' not in \
  386. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  387. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  388. 'viewCountText']:
  389. out_play_cnt = 0
  390. else:
  391. out_play_cnt = int(
  392. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  393. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  394. 'viewCountText']['simpleText'].split('次')[0].replace(',', ''))
  395. # 站外用户注册时间
  396. if 'joinedDateText' not in \
  397. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  398. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  399. out_create_time = ''
  400. elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  401. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  402. 'joinedDateText']:
  403. out_create_time = ''
  404. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  405. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  406. 'joinedDateText']['runs']) == 0:
  407. out_create_time = ''
  408. elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  409. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  410. 'joinedDateText']['runs'][0]:
  411. out_create_time = ''
  412. else:
  413. out_create_time = \
  414. tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0][
  415. 'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
  416. 'joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月',
  417. '-').replace(
  418. '日', '')
  419. out_user_dict = {
  420. 'out_user_name': out_user_name,
  421. 'out_avatar_url': out_avatar_url,
  422. 'out_fans': out_fans,
  423. 'out_play_cnt': out_play_cnt,
  424. 'out_create_time': out_create_time,
  425. }
  426. # print(out_user_dict)
  427. return out_user_dict
  428. except Exception as e:
  429. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  430. @classmethod
  431. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  432. """
  433. 补全飞书用户表信息,并返回
  434. :param log_type: 日志
  435. :param crawler: 哪款爬虫
  436. :param sheetid: 飞书表
  437. :param env: 正式环境:prod,测试环境:dev
  438. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  439. :return: user_list
  440. """
  441. try:
  442. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  443. user_list = []
  444. for i in range(1, len(user_sheet)):
  445. # for i in range(181, len(user_sheet)):
  446. out_uid = user_sheet[i][2]
  447. user_name = user_sheet[i][3]
  448. browse_id = user_sheet[i][5]
  449. our_uid = user_sheet[i][6]
  450. uer_url = user_sheet[i][4]
  451. if out_uid is not None and user_name is not None:
  452. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  453. # 获取站外browse_id,并写入飞书
  454. # if browse_id is None:
  455. # browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
  456. # if browse_id is None:
  457. # Common.logger(log_type, crawler).warning('browse_id is None !')
  458. # else:
  459. # Feishu.update_values(log_type, crawler, sheetid, f'F{i + 1}:F{i + 1}', [[browse_id]])
  460. # Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
  461. # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
  462. if our_uid is None:
  463. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  464. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  465. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  466. if our_user_info is None or len(our_user_info) == 0:
  467. # 获取站外账号信息,写入数据库
  468. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  469. out_avatar_url = out_user_dict['out_avatar_url']
  470. out_create_time = out_user_dict['out_create_time']
  471. out_play_cnt = out_user_dict['out_play_cnt']
  472. out_fans = out_user_dict['out_fans']
  473. tag = 'youtube爬虫,定向爬虫策略'
  474. # 创建站内账号
  475. create_user_dict = {
  476. 'nickName': user_name,
  477. 'avatarUrl': out_avatar_url,
  478. 'tagName': tag,
  479. }
  480. our_uid = Users.create_uid(log_type, crawler, create_user_dict, env)
  481. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  482. if env == 'prod':
  483. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  484. else:
  485. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  486. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  487. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  488. [[our_uid, our_user_link]])
  489. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  490. sql = f""" insert into crawler_user(user_id,
  491. out_user_id,
  492. out_user_name,
  493. out_avatar_url,
  494. out_create_time,
  495. out_play_cnt,
  496. out_fans,
  497. platform,
  498. tag)
  499. values({our_uid},
  500. "{out_uid}",
  501. "{user_name}",
  502. "{out_avatar_url}",
  503. "{out_create_time}",
  504. {out_play_cnt},
  505. {out_fans},
  506. "{cls.platform}",
  507. "{tag}") """
  508. Common.logger(log_type, crawler).info(f'sql:{sql}')
  509. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  510. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  511. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  512. else:
  513. our_uid = our_user_info[0][1]
  514. if 'env' == 'prod':
  515. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  516. else:
  517. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  518. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  519. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  520. [[our_uid, our_user_link]])
  521. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  522. user_dict = {
  523. 'out_user_id': out_uid,
  524. 'out_user_name': user_name,
  525. 'out_browse_id': browse_id,
  526. 'our_user_id': our_uid,
  527. 'out_user_url': uer_url
  528. }
  529. user_list.append(user_dict)
  530. else:
  531. pass
  532. return user_list
  533. except Exception as e:
  534. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  535. @classmethod
  536. def get_continuation(cls, data):
  537. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  538. return continuation
  539. @classmethod
  540. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  541. """
  542. 获取个人主页视频列表
  543. :param log_type: 日志
  544. :param crawler: 哪款爬虫
  545. :param browse_id: 每个用户主页的请求参数中唯一值
  546. :param out_uid: 站外用户UID
  547. :return: video_list
  548. """
  549. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  550. payload = json.dumps({
  551. "context": {
  552. "client": {
  553. "hl": "zh-CN",
  554. "gl": "US",
  555. "remoteHost": "38.93.247.21",
  556. "deviceMake": "Apple",
  557. "deviceModel": "",
  558. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  559. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  560. "clientName": "WEB",
  561. "clientVersion": "2.20230201.01.00",
  562. "osName": "Macintosh",
  563. "osVersion": "10_15_7",
  564. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  565. "platform": "DESKTOP",
  566. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  567. "configInfo": {
  568. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  569. },
  570. "timeZone": "Asia/Shanghai",
  571. "browserName": "Chrome",
  572. "browserVersion": "109.0.0.0",
  573. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  574. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  575. "screenWidthPoints": 944,
  576. "screenHeightPoints": 969,
  577. "screenPixelDensity": 1,
  578. "screenDensityFloat": 1,
  579. "utcOffsetMinutes": 480,
  580. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  581. "memoryTotalKbytes": "8000000",
  582. "mainAppWebInfo": {
  583. "graftUrl": f"/{out_uid}/videos",
  584. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  585. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  586. "isWebNativeShareAvailable": True
  587. }
  588. },
  589. "user": {
  590. "lockedSafetyMode": False
  591. },
  592. "request": {
  593. "useSsl": True,
  594. "internalExperimentFlags": [],
  595. "consistencyTokenJars": []
  596. },
  597. "clickTracking": {
  598. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  599. },
  600. "adSignalsInfo": {
  601. "params": [
  602. {
  603. "key": "dt",
  604. "value": "1675676731048"
  605. },
  606. {
  607. "key": "flash",
  608. "value": "0"
  609. },
  610. {
  611. "key": "frm",
  612. "value": "0"
  613. },
  614. {
  615. "key": "u_tz",
  616. "value": "480"
  617. },
  618. {
  619. "key": "u_his",
  620. "value": "4"
  621. },
  622. {
  623. "key": "u_h",
  624. "value": "1080"
  625. },
  626. {
  627. "key": "u_w",
  628. "value": "1920"
  629. },
  630. {
  631. "key": "u_ah",
  632. "value": "1080"
  633. },
  634. {
  635. "key": "u_aw",
  636. "value": "1920"
  637. },
  638. {
  639. "key": "u_cd",
  640. "value": "24"
  641. },
  642. {
  643. "key": "bc",
  644. "value": "31"
  645. },
  646. {
  647. "key": "bih",
  648. "value": "969"
  649. },
  650. {
  651. "key": "biw",
  652. "value": "944"
  653. },
  654. {
  655. "key": "brdim",
  656. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  657. },
  658. {
  659. "key": "vis",
  660. "value": "1"
  661. },
  662. {
  663. "key": "wgl",
  664. "value": "true"
  665. },
  666. {
  667. "key": "ca_type",
  668. "value": "image"
  669. }
  670. ],
  671. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  672. }
  673. },
  674. # "browseId": browse_id,
  675. "params": "EgZ2aWRlb3PyBgQKAjoA",
  676. "continuation": cls.continuation
  677. })
  678. headers = {
  679. 'authority': 'www.youtube.com',
  680. 'accept': '*/*',
  681. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  682. 'cache-control': 'no-cache',
  683. 'content-type': 'application/json',
  684. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  685. 'origin': 'https://www.youtube.com',
  686. 'pragma': 'no-cache',
  687. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  688. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  689. 'sec-ch-ua-arch': '"arm"',
  690. 'sec-ch-ua-bitness': '"64"',
  691. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  692. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  693. 'sec-ch-ua-mobile': '?0',
  694. 'sec-ch-ua-model': '',
  695. 'sec-ch-ua-platform': '"macOS"',
  696. 'sec-ch-ua-platform-version': '"12.4.0"',
  697. 'sec-ch-ua-wow64': '?0',
  698. 'sec-fetch-dest': 'empty',
  699. 'sec-fetch-mode': 'same-origin',
  700. 'sec-fetch-site': 'same-origin',
  701. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  702. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  703. 'x-youtube-bootstrap-logged-in': 'false',
  704. 'x-youtube-client-name': '1',
  705. 'x-youtube-client-version': '2.20230201.01.00'
  706. }
  707. try:
  708. response = requests.post(url=url, headers=headers, data=payload)
  709. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  710. cls.continuation = response.json()['trackingParams']
  711. if response.status_code != 200:
  712. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  713. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  714. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  715. elif 'continuationContents' in response.json():
  716. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  717. if 'richGridContinuation' not in response.json()['continuationContents']:
  718. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  719. Common.logger(log_type, crawler).warning(
  720. f'get_feeds_response:{response.json()["continuationContents"]}\n')
  721. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  722. Common.logger(log_type, crawler).warning(
  723. f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  724. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  725. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  726. return feeds
  727. elif 'onResponseReceivedActions' in response.json():
  728. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  729. if len(response.json()['onResponseReceivedActions']) == 0:
  730. Common.logger(log_type, crawler).warning(
  731. f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  732. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  733. Common.logger(log_type, crawler).warning(
  734. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  735. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0][
  736. 'appendContinuationItemsAction']:
  737. Common.logger(log_type, crawler).warning(
  738. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  739. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction'][
  740. 'continuationItems']) == 0:
  741. Common.logger(log_type, crawler).warning(
  742. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  743. else:
  744. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"][
  745. "continuationItems"]
  746. return feeds
  747. else:
  748. Common.logger(log_type, crawler).info('feeds is None\n')
  749. except Exception as e:
  750. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  751. @classmethod
  752. def get_first_page(cls, user_url):
  753. try:
  754. res = requests.get(url=user_url, headers=cls.headers)
  755. info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
  756. ytInitialData = json.loads(info)
  757. video_list = \
  758. ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content'][
  759. 'richGridRenderer']['contents']
  760. except Exception as e:
  761. video_list = []
  762. return video_list
  763. @classmethod
  764. def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  765. machine, out_user_url, continuation):
  766. post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  767. payload = json.dumps({
  768. "context": {
  769. "client": {
  770. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
  771. "clientName": "WEB",
  772. "clientVersion": "2.20230221.06.00",
  773. "osName": "Macintosh",
  774. "osVersion": "10_15_7",
  775. "originalUrl": "https://www.youtube.com/@wongkim728/videos",
  776. "screenPixelDensity": 2,
  777. "platform": "DESKTOP",
  778. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  779. "configInfo": {
  780. "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
  781. },
  782. "screenDensityFloat": 2,
  783. "timeZone": "Asia/Shanghai",
  784. "browserName": "Chrome",
  785. "browserVersion": "110.0.0.0",
  786. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  787. "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
  788. "screenWidthPoints": 576,
  789. "screenHeightPoints": 764,
  790. "utcOffsetMinutes": 480,
  791. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  792. "connectionType": "CONN_CELLULAR_4G",
  793. "memoryTotalKbytes": "8000000",
  794. "mainAppWebInfo": {
  795. "graftUrl": out_user_url,
  796. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  797. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  798. "isWebNativeShareAvailable": False
  799. }
  800. },
  801. "user": {
  802. "lockedSafetyMode": False
  803. },
  804. "request": {
  805. "useSsl": True,
  806. "internalExperimentFlags": [],
  807. "consistencyTokenJars": []
  808. },
  809. "clickTracking": {
  810. "clickTrackingParams": ""
  811. },
  812. "adSignalsInfo": {
  813. "params": [],
  814. "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
  815. }
  816. },
  817. "continuation": continuation
  818. })
  819. headers = {
  820. # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
  821. 'content-type': 'application/json',
  822. 'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
  823. 'origin': 'https://www.youtube.com',
  824. 'referer': out_user_url,
  825. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  826. }
  827. try:
  828. res = requests.request("POST", post_url, headers=headers, data=payload).json()
  829. video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  830. for data in video_infos:
  831. if 'richItemRenderer' in data:
  832. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  833. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  834. # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
  835. # 发布时间<=30天
  836. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  837. if int(time.time()) - publish_time <= 3600 * 24 * 30:
  838. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  839. machine)
  840. else:
  841. Common.logger(log_type, crawler).info('发布时间超过30天\n')
  842. return
  843. else:
  844. continuation = cls.get_continuation(data)
  845. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  846. machine, out_user_url, continuation)
  847. except:
  848. return
  849. @classmethod
  850. def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  851. machine, out_user_url):
  852. try:
  853. feeds = cls.get_first_page(out_user_url)
  854. for data in feeds:
  855. if 'richItemRenderer' in data:
  856. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  857. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  858. # 发布时间<=30天
  859. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  860. if int(time.time()) - publish_time <= 3600 * 24 * 30:
  861. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  862. machine)
  863. else:
  864. Common.logger(log_type, crawler).info('发布时间超过30天\n')
  865. return
  866. else:
  867. continuation = cls.get_continuation(data)
  868. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  869. machine, out_user_url, continuation=continuation)
  870. except Exception as e:
  871. Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  872. @classmethod
  873. def filter_emoji(cls, title):
  874. # 过滤表情
  875. try:
  876. co = re.compile(u'[\U00010000-\U0010ffff]')
  877. except re.error:
  878. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  879. return co.sub("", title)
  880. @classmethod
  881. def is_contain_chinese(cls, strword):
  882. for ch in strword:
  883. if u'\u4e00' <= ch <= u'\u9fff':
  884. return True
  885. return False
  886. @classmethod
  887. def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id, machine):
  888. try:
  889. if 'streamingData' not in video_dict:
  890. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  891. elif 'videoDetails' not in video_dict:
  892. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  893. elif 'microformat' not in video_dict:
  894. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  895. else:
  896. playerMicroformatRenderer = video_dict['microformat']['playerMicroformatRenderer']
  897. videoDetails = video_dict['videoDetails']
  898. # streamingData = response.json()['streamingData']
  899. # video_title
  900. if 'title' not in videoDetails:
  901. video_title = ''
  902. else:
  903. video_title = videoDetails['title']
  904. video_title = cls.filter_emoji(video_title)
  905. # if Translate.is_contains_chinese(video_title) is False:
  906. if not cls.is_contain_chinese(video_title):
  907. video_title = Translate.google_translate(video_title, machine) \
  908. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  909. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  910. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  911. if 'lengthSeconds' not in videoDetails:
  912. duration = 0
  913. else:
  914. duration = int(videoDetails['lengthSeconds'])
  915. # play_cnt
  916. if 'viewCount' not in videoDetails:
  917. play_cnt = 0
  918. else:
  919. play_cnt = int(videoDetails['viewCount'])
  920. # publish_time
  921. if 'publishDate' not in playerMicroformatRenderer:
  922. publish_time = ''
  923. else:
  924. publish_time = playerMicroformatRenderer['publishDate']
  925. if publish_time == '':
  926. publish_time_stamp = 0
  927. elif ':' in publish_time:
  928. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  929. else:
  930. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  931. # user_name
  932. if 'author' not in videoDetails:
  933. user_name = ''
  934. else:
  935. user_name = videoDetails['author']
  936. # cover_url
  937. if 'thumbnail' not in videoDetails:
  938. cover_url = ''
  939. elif 'thumbnails' not in videoDetails['thumbnail']:
  940. cover_url = ''
  941. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  942. cover_url = ''
  943. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  944. cover_url = ''
  945. else:
  946. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  947. # video_url
  948. # if 'formats' not in streamingData:
  949. # video_url = ''
  950. # elif len(streamingData['formats']) == 0:
  951. # video_url = ''
  952. # elif 'url' not in streamingData['formats'][-1]:
  953. # video_url = ''
  954. # else:
  955. # video_url = streamingData['formats'][-1]['url']
  956. video_url = f"https://www.youtube.com/watch?v={video_id}"
  957. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  958. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  959. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  960. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  961. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  962. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  963. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  964. video_dict = {
  965. 'video_title': video_title,
  966. 'video_id': video_id,
  967. 'duration': duration,
  968. 'play_cnt': play_cnt,
  969. 'publish_time': publish_time,
  970. 'publish_time_stamp': publish_time_stamp,
  971. 'user_name': user_name,
  972. 'out_uid': out_uid,
  973. 'cover_url': cover_url,
  974. 'video_url': video_url,
  975. }
  976. return video_dict
  977. except Exception as e:
  978. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  979. @classmethod
  980. def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
  981. try:
  982. url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  983. payload = json.dumps({
  984. "context": {
  985. "client": {
  986. "hl": "zh-CN",
  987. "gl": "US",
  988. "remoteHost": "38.93.247.21",
  989. "deviceMake": "Apple",
  990. "deviceModel": "",
  991. "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
  992. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  993. "clientName": "WEB",
  994. "clientVersion": "2.20230201.01.00",
  995. "osName": "Macintosh",
  996. "osVersion": "10_15_7",
  997. "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
  998. "platform": "DESKTOP",
  999. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  1000. "configInfo": {
  1001. "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  1002. },
  1003. "timeZone": "Asia/Shanghai",
  1004. "browserName": "Chrome",
  1005. "browserVersion": "109.0.0.0",
  1006. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  1007. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
  1008. "screenWidthPoints": 1037,
  1009. "screenHeightPoints": 969,
  1010. "screenPixelDensity": 1,
  1011. "screenDensityFloat": 1,
  1012. "utcOffsetMinutes": 480,
  1013. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  1014. "memoryTotalKbytes": "8000000",
  1015. "clientScreen": "WATCH",
  1016. "mainAppWebInfo": {
  1017. "graftUrl": f"/watch?v={video_id}",
  1018. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  1019. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  1020. "isWebNativeShareAvailable": True
  1021. }
  1022. },
  1023. "user": {
  1024. "lockedSafetyMode": False
  1025. },
  1026. "request": {
  1027. "useSsl": True,
  1028. "internalExperimentFlags": [],
  1029. "consistencyTokenJars": []
  1030. },
  1031. "clickTracking": {
  1032. "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
  1033. },
  1034. "adSignalsInfo": {
  1035. "params": [
  1036. {
  1037. "key": "dt",
  1038. "value": "1675749222611"
  1039. },
  1040. {
  1041. "key": "flash",
  1042. "value": "0"
  1043. },
  1044. {
  1045. "key": "frm",
  1046. "value": "0"
  1047. },
  1048. {
  1049. "key": "u_tz",
  1050. "value": "480"
  1051. },
  1052. {
  1053. "key": "u_his",
  1054. "value": "3"
  1055. },
  1056. {
  1057. "key": "u_h",
  1058. "value": "1080"
  1059. },
  1060. {
  1061. "key": "u_w",
  1062. "value": "1920"
  1063. },
  1064. {
  1065. "key": "u_ah",
  1066. "value": "1080"
  1067. },
  1068. {
  1069. "key": "u_aw",
  1070. "value": "1920"
  1071. },
  1072. {
  1073. "key": "u_cd",
  1074. "value": "24"
  1075. },
  1076. {
  1077. "key": "bc",
  1078. "value": "31"
  1079. },
  1080. {
  1081. "key": "bih",
  1082. "value": "969"
  1083. },
  1084. {
  1085. "key": "biw",
  1086. "value": "1037"
  1087. },
  1088. {
  1089. "key": "brdim",
  1090. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"
  1091. },
  1092. {
  1093. "key": "vis",
  1094. "value": "1"
  1095. },
  1096. {
  1097. "key": "wgl",
  1098. "value": "true"
  1099. },
  1100. {
  1101. "key": "ca_type",
  1102. "value": "image"
  1103. }
  1104. ],
  1105. "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
  1106. }
  1107. },
  1108. "videoId": str(video_id),
  1109. "playbackContext": {
  1110. "contentPlaybackContext": {
  1111. "currentUrl": f"/watch?v={video_id}",
  1112. "vis": 0,
  1113. "splay": False,
  1114. "autoCaptionsDefaultOn": False,
  1115. "autonavState": "STATE_NONE",
  1116. "html5Preference": "HTML5_PREF_WANTS",
  1117. "signatureTimestamp": 19394,
  1118. "referer": f"https://www.youtube.com/watch?v={video_id}",
  1119. "lactMilliseconds": "-1",
  1120. "watchAmbientModeContext": {
  1121. "watchAmbientModeEnabled": True
  1122. }
  1123. }
  1124. },
  1125. "racyCheckOk": False,
  1126. "contentCheckOk": False
  1127. })
  1128. headers = {
  1129. 'authority': 'www.youtube.com',
  1130. 'accept': '*/*',
  1131. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  1132. 'cache-control': 'no-cache',
  1133. 'content-type': 'application/json',
  1134. 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
  1135. 'origin': 'https://www.youtube.com',
  1136. 'pragma': 'no-cache',
  1137. 'referer': f'https://www.youtube.com/watch?v={video_id}',
  1138. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  1139. 'sec-ch-ua-arch': '"arm"',
  1140. 'sec-ch-ua-bitness': '"64"',
  1141. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  1142. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  1143. 'sec-ch-ua-mobile': '?0',
  1144. 'sec-ch-ua-model': '',
  1145. 'sec-ch-ua-platform': '"macOS"',
  1146. 'sec-ch-ua-platform-version': '"12.4.0"',
  1147. 'sec-ch-ua-wow64': '?0',
  1148. 'sec-fetch-dest': 'empty',
  1149. 'sec-fetch-mode': 'same-origin',
  1150. 'sec-fetch-site': 'same-origin',
  1151. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  1152. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
  1153. 'x-youtube-bootstrap-logged-in': 'false',
  1154. 'x-youtube-client-name': '1',
  1155. 'x-youtube-client-version': '2.20230201.01.00'
  1156. }
  1157. response = requests.post(url=url, headers=headers, data=payload)
  1158. if response.status_code != 200:
  1159. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
  1160. elif 'streamingData' not in response.json():
  1161. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  1162. elif 'videoDetails' not in response.json():
  1163. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  1164. elif 'microformat' not in response.json():
  1165. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  1166. else:
  1167. playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer']
  1168. videoDetails = response.json()['videoDetails']
  1169. # streamingData = response.json()['streamingData']
  1170. # video_title
  1171. if 'title' not in videoDetails:
  1172. video_title = ''
  1173. else:
  1174. video_title = videoDetails['title']
  1175. video_title = cls.filter_emoji(video_title)
  1176. if not cls.is_contain_chinese(video_title):
  1177. video_title = Translate.google_translate(video_title, machine) \
  1178. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  1179. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  1180. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  1181. if 'lengthSeconds' not in videoDetails:
  1182. duration = 0
  1183. else:
  1184. duration = int(videoDetails['lengthSeconds'])
  1185. # play_cnt
  1186. if 'viewCount' not in videoDetails:
  1187. play_cnt = 0
  1188. else:
  1189. play_cnt = int(videoDetails['viewCount'])
  1190. # publish_time
  1191. if 'publishDate' not in playerMicroformatRenderer:
  1192. publish_time = ''
  1193. else:
  1194. publish_time = playerMicroformatRenderer['publishDate']
  1195. if publish_time == '':
  1196. publish_time_stamp = 0
  1197. elif ':' in publish_time:
  1198. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  1199. else:
  1200. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  1201. # user_name
  1202. if 'author' not in videoDetails:
  1203. user_name = ''
  1204. else:
  1205. user_name = videoDetails['author']
  1206. # cover_url
  1207. if 'thumbnail' not in videoDetails:
  1208. cover_url = ''
  1209. elif 'thumbnails' not in videoDetails['thumbnail']:
  1210. cover_url = ''
  1211. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  1212. cover_url = ''
  1213. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  1214. cover_url = ''
  1215. else:
  1216. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  1217. # video_url
  1218. # if 'formats' not in streamingData:
  1219. # video_url = ''
  1220. # elif len(streamingData['formats']) == 0:
  1221. # video_url = ''
  1222. # elif 'url' not in streamingData['formats'][-1]:
  1223. # video_url = ''
  1224. # else:
  1225. # video_url = streamingData['formats'][-1]['url']
  1226. video_url = f"https://www.youtube.com/watch?v={video_id}"
  1227. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  1228. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  1229. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  1230. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  1231. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  1232. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  1233. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  1234. video_dict = {
  1235. 'video_title': video_title,
  1236. 'video_id': video_id,
  1237. 'duration': duration,
  1238. 'play_cnt': play_cnt,
  1239. 'publish_time': publish_time,
  1240. 'publish_time_stamp': publish_time_stamp,
  1241. 'user_name': user_name,
  1242. 'out_uid': out_uid,
  1243. 'cover_url': cover_url,
  1244. 'video_url': video_url,
  1245. }
  1246. return video_dict
  1247. except Exception as e:
  1248. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  1249. @classmethod
  1250. def repeat_video(cls, log_type, crawler, video_id, env, machine):
  1251. sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
  1252. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  1253. return len(repeat_video)
  1254. @classmethod
  1255. def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
  1256. try:
  1257. # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
  1258. # repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  1259. if video_dict['video_title'] == '' or video_dict['video_url'] == '':
  1260. Common.logger(log_type, crawler).info('无效视频\n')
  1261. elif video_dict['duration'] > 1200 or video_dict['duration'] < 60:
  1262. Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
  1263. # elif repeat_video is not None and len(repeat_video) != 0:
  1264. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
  1265. Common.logger(log_type, crawler).info('视频已下载\n')
  1266. elif video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]:
  1267. Common.logger(log_type, crawler).info('视频已下载\n')
  1268. else:
  1269. # 下载视频
  1270. Common.logger(log_type, crawler).info('开始下载视频...')
  1271. # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
  1272. Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
  1273. video_dict['video_url'])
  1274. # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
  1275. # video_width = int(ffmpeg_dict['width'])
  1276. # video_height = int(ffmpeg_dict['height'])
  1277. # video_size = int(ffmpeg_dict['size'])
  1278. video_width = 1280
  1279. video_height = 720
  1280. duration = int(video_dict['duration'])
  1281. Common.logger(log_type, crawler).info(f'video_width:{video_width}')
  1282. Common.logger(log_type, crawler).info(f'video_height:{video_height}')
  1283. Common.logger(log_type, crawler).info(f'duration:{duration}')
  1284. # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
  1285. video_dict['video_width'] = video_width
  1286. video_dict['video_height'] = video_height
  1287. video_dict['duration'] = duration
  1288. video_dict['comment_cnt'] = 0
  1289. video_dict['like_cnt'] = 0
  1290. video_dict['share_cnt'] = 0
  1291. video_dict['avatar_url'] = video_dict['cover_url']
  1292. video_dict['session'] = f'youtube{int(time.time())}'
  1293. rule = '1,2'
  1294. # if duration < 60 or duration > 600:
  1295. # # 删除视频文件夹
  1296. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1297. # Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
  1298. # return
  1299. # if duration == 0 or duration is None:
  1300. # # 删除视频文件夹
  1301. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1302. # Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
  1303. # return
  1304. # else:
  1305. # 下载封面
  1306. Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
  1307. # 保存视频文本信息
  1308. Common.save_video_info(log_type, crawler, video_dict)
  1309. # 上传视频
  1310. Common.logger(log_type, crawler).info(f"开始上传视频")
  1311. if env == 'dev':
  1312. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  1313. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  1314. else:
  1315. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  1316. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  1317. Common.logger(log_type, crawler).info("视频上传完成")
  1318. if our_video_id is None:
  1319. # 删除视频文件夹
  1320. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1321. return
  1322. # 视频信息保存至飞书
  1323. Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
  1324. # 视频ID工作表,首行写入数据
  1325. upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
  1326. values = [[upload_time,
  1327. "定向榜",
  1328. video_dict['video_id'],
  1329. video_dict['video_title'],
  1330. our_video_link,
  1331. video_dict['play_cnt'],
  1332. video_dict['duration'],
  1333. f'{video_width}*{video_height}',
  1334. video_dict['publish_time'],
  1335. video_dict['user_name'],
  1336. video_dict['cover_url'],
  1337. video_dict['video_url']
  1338. ]]
  1339. # time.sleep(1)
  1340. Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
  1341. Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
  1342. # 视频信息保存数据库
  1343. sql = f""" insert into crawler_video(video_id,
  1344. user_id,
  1345. out_user_id,
  1346. platform,
  1347. strategy,
  1348. out_video_id,
  1349. video_title,
  1350. cover_url,
  1351. video_url,
  1352. duration,
  1353. publish_time,
  1354. play_cnt,
  1355. crawler_rule,
  1356. width,
  1357. height)
  1358. values({our_video_id},
  1359. "{our_uid}",
  1360. "{video_dict['out_uid']}",
  1361. "{cls.platform}",
  1362. "定向爬虫策略",
  1363. "{video_dict['video_id']}",
  1364. "{video_dict['video_title']}",
  1365. "{video_dict['cover_url']}",
  1366. "{video_dict['video_url']}",
  1367. {int(duration)},
  1368. "{video_dict['publish_time']}",
  1369. {int(video_dict['play_cnt'])},
  1370. "{rule}",
  1371. {int(video_width)},
  1372. {int(video_height)}) """
  1373. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  1374. Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  1375. except Exception as e:
  1376. Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n")
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the home-page videos of every user returned by the Feishu sheet.

      The user list is loaded with ``cls.get_user_from_feishu`` (sheet id
      ``'c467d7'``) and ``cls.get_videos`` is called once per user.

      :param log_type: forwarded to ``Common.logger`` and to the helpers.
      :param crawler: forwarded to ``Common.logger`` and to the helpers.
      :param strategy: forwarded to ``cls.get_videos``.
      :param oss_endpoint: forwarded to ``cls.get_videos``.
      :param env: forwarded to ``cls.get_user_from_feishu`` and ``cls.get_videos``.
      :param machine: forwarded to ``cls.get_user_from_feishu`` and ``cls.get_videos``.
      :return: ``None``. Any exception is logged at error level and swallowed.
      """
      try:
          user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
          # Truthiness covers both an empty list and a ``None`` result; the
          # former ``len(user_list) == 0`` raised TypeError on ``None`` and was
          # reported as a generic failure instead of "user list is empty".
          if not user_list:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return

          for user_dict in user_list:
              out_uid = user_dict['out_user_id']
              user_name = user_dict['out_user_name']
              our_uid = user_dict['our_user_id']
              out_user_url = user_dict['out_user_url']
              Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
              cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, machine,
                             out_user_url)
              # Clear the class-level continuation value before the next user
              # (presumably the paging token consumed by get_videos - confirm).
              # NOTE(review): placed inside the per-user loop; verify against
              # the intended nesting.
              cls.continuation = ''
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  if __name__ == "__main__":
      # Manual debugging entry point. Every call below is disabled, so running
      # this module directly does nothing. Uncomment a single line to exercise
      # the corresponding Follow helper by hand.
      # print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')
      # Sample input for the emoji filter (title text followed by emoji and a hashtag):
      # print(Follow.filter_emoji("姐妹倆一唱一和,完美配合,終於把大慶降服了😅😅#萌娃搞笑日常"))
      # Follow.repeat_video('follow', 'youtube', 4, "dev", "local")
      # Sample titles for checking the title clean-up chain on the last line:
      # title = "'西部巡游220丨两人一车环游中国半年,需要花费多少钱? 2万公里吃住行费用总结'"
      # title = "'Insanely Crowded Shanghai Yu Garden Lantern Festival Walk Tour 2023 人气爆棚的上海豫园元宵节漫步之行 4K'"
      # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))
      pass