youtube_follow.py 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. """
  7. import os
  8. import shutil
  9. import sys
  10. import time
  11. import json
  12. import requests
  13. from selenium import webdriver
  14. from selenium.webdriver.chrome.service import Service
  15. from selenium.webdriver.common.by import By
  16. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  17. sys.path.append(os.getcwd())
  18. from common.common import Common
  19. from common.db import MysqlHelper
  20. from common.feishu import Feishu
  21. from common.users import Users
  22. from common.publish import Publish
  23. from common.translate import Translate
class Follow:
    """Crawler helpers for the YouTube targeted-channel list."""

    # Pagination parameter sent as "continuation" when requesting a feed page
    continuation = ''
    # Platform being crawled
    platform = 'youtube'
  29. @classmethod
  30. def get_browse_id(cls, log_type, crawler, out_user_id, machine):
  31. """
  32. 获取每个用户的 browse_id
  33. :param log_type: 日志
  34. :param crawler: 哪款爬虫
  35. :param out_user_id: 站外用户 UID
  36. :param machine: 部署机器,阿里云填写 aliyun,线下分别填写 macpro,macair,local
  37. :return: browse_id
  38. """
  39. try:
  40. # 打印请求配置
  41. ca = DesiredCapabilities.CHROME
  42. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  43. # 不打开浏览器运行
  44. chrome_options = webdriver.ChromeOptions()
  45. chrome_options.add_argument("--headless")
  46. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  47. chrome_options.add_argument("--no-sandbox")
  48. # driver初始化
  49. if machine == 'aliyun':
  50. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  51. elif machine == 'macpro':
  52. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  53. elif machine == 'macair':
  54. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/piaoquan/Downloads/chromedriver'))
  55. else:
  56. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  57. driver.implicitly_wait(10)
  58. url = f'https://www.youtube.com/{out_user_id}/videos'
  59. driver.get(url)
  60. # driver.save_screenshot("./1.png")
  61. # 向上滑动 1000 个像素
  62. # driver.execute_script('window.scrollBy(0, 2000)')
  63. # driver.save_screenshot("./2.png")
  64. time.sleep(3)
  65. accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
  66. accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
  67. if len(accept_btns) != 0:
  68. accept_btns[0].click()
  69. time.sleep(2)
  70. elif len(accept_btns_eng) != 0:
  71. accept_btns_eng[0].click()
  72. time.sleep(2)
  73. browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
  74. driver.quit()
  75. return browse_id
  76. except Exception as e:
  77. Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  78. @classmethod
  79. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  80. """
  81. 获取站外用户信息
  82. :param log_type: 日志
  83. :param crawler: 哪款爬虫
  84. :param browse_id: browse_id
  85. :param out_user_id: 站外用户 UID
  86. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  87. 'out_avatar_url': 站外用户头像,
  88. 'out_fans': 站外用户粉丝量,
  89. 'out_play_cnt': 站外用户总播放量,
  90. 'out_create_time': 站外用户创建时间}
  91. """
  92. try:
  93. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  94. payload = json.dumps({
  95. "context": {
  96. "client": {
  97. "hl": "zh-CN",
  98. "gl": "US",
  99. "remoteHost": "38.93.247.21",
  100. "deviceMake": "Apple",
  101. "deviceModel": "",
  102. "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D",
  103. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  104. "clientName": "WEB",
  105. "clientVersion": "2.20230201.01.00",
  106. "osName": "Macintosh",
  107. "osVersion": "10_15_7",
  108. "originalUrl": f"https://www.youtube.com/{out_user_id}/about",
  109. "screenPixelDensity": 1,
  110. "platform": "DESKTOP",
  111. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  112. "configInfo": {
  113. "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  114. },
  115. "screenDensityFloat": 1,
  116. "timeZone": "Asia/Shanghai",
  117. "browserName": "Chrome",
  118. "browserVersion": "109.0.0.0",
  119. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  120. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G",
  121. "screenWidthPoints": 805,
  122. "screenHeightPoints": 969,
  123. "utcOffsetMinutes": 480,
  124. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  125. "memoryTotalKbytes": "8000000",
  126. "mainAppWebInfo": {
  127. "graftUrl": f"/{out_user_id}/about",
  128. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  129. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  130. "isWebNativeShareAvailable": True
  131. }
  132. },
  133. "user": {
  134. "lockedSafetyMode": False
  135. },
  136. "request": {
  137. "useSsl": True,
  138. "internalExperimentFlags": [],
  139. "consistencyTokenJars": []
  140. },
  141. "clickTracking": {
  142. "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak="
  143. },
  144. "adSignalsInfo": {
  145. "params": [
  146. {
  147. "key": "dt",
  148. "value": "1675668045032"
  149. },
  150. {
  151. "key": "flash",
  152. "value": "0"
  153. },
  154. {
  155. "key": "frm",
  156. "value": "0"
  157. },
  158. {
  159. "key": "u_tz",
  160. "value": "480"
  161. },
  162. {
  163. "key": "u_his",
  164. "value": "1"
  165. },
  166. {
  167. "key": "u_h",
  168. "value": "1080"
  169. },
  170. {
  171. "key": "u_w",
  172. "value": "1920"
  173. },
  174. {
  175. "key": "u_ah",
  176. "value": "1080"
  177. },
  178. {
  179. "key": "u_aw",
  180. "value": "1920"
  181. },
  182. {
  183. "key": "u_cd",
  184. "value": "24"
  185. },
  186. {
  187. "key": "bc",
  188. "value": "31"
  189. },
  190. {
  191. "key": "bih",
  192. "value": "969"
  193. },
  194. {
  195. "key": "biw",
  196. "value": "805"
  197. },
  198. {
  199. "key": "brdim",
  200. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969"
  201. },
  202. {
  203. "key": "vis",
  204. "value": "1"
  205. },
  206. {
  207. "key": "wgl",
  208. "value": "true"
  209. },
  210. {
  211. "key": "ca_type",
  212. "value": "image"
  213. }
  214. ],
  215. "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A"
  216. }
  217. },
  218. "browseId": browse_id,
  219. "params": "EgVhYm91dPIGBAoCEgA%3D"
  220. })
  221. headers = {
  222. 'authority': 'www.youtube.com',
  223. 'accept': '*/*',
  224. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  225. 'cache-control': 'no-cache',
  226. 'content-type': 'application/json',
  227. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D',
  228. 'origin': 'https://www.youtube.com',
  229. 'pragma': 'no-cache',
  230. 'referer': f'https://www.youtube.com/{out_user_id}/videos',
  231. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  232. 'sec-ch-ua-arch': '"arm"',
  233. 'sec-ch-ua-bitness': '"64"',
  234. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  235. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  236. 'sec-ch-ua-mobile': '?0',
  237. 'sec-ch-ua-model': '',
  238. 'sec-ch-ua-platform': '"macOS"',
  239. 'sec-ch-ua-platform-version': '"12.4.0"',
  240. 'sec-ch-ua-wow64': '?0',
  241. 'sec-fetch-dest': 'empty',
  242. 'sec-fetch-mode': 'same-origin',
  243. 'sec-fetch-site': 'same-origin',
  244. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  245. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D',
  246. 'x-youtube-bootstrap-logged-in': 'false',
  247. 'x-youtube-client-name': '1',
  248. 'x-youtube-client-version': '2.20230201.01.00'
  249. }
  250. response = requests.post(url=url, headers=headers, data=payload)
  251. if response.status_code != 200:
  252. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  253. elif 'contents' not in response.text or 'header' not in response.text:
  254. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  255. elif 'c4TabbedHeaderRenderer' not in response.json()['header']:
  256. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n')
  257. elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']:
  258. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n')
  259. elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']:
  260. Common.logger(log_type, crawler).warning(f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n")
  261. else:
  262. header = response.json()['header']['c4TabbedHeaderRenderer']
  263. tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs']
  264. for i in range(len(tabs)):
  265. if 'tabRenderer' not in tabs[i]:
  266. title = ''
  267. elif 'title' not in tabs[i]['tabRenderer']:
  268. title = ''
  269. else:
  270. title = tabs[i]['tabRenderer']['title']
  271. if title == '简介':
  272. if 'tabRenderer' not in tabs[i]:
  273. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n")
  274. elif 'content' not in tabs[i]['tabRenderer']:
  275. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n")
  276. elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']:
  277. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n")
  278. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']:
  279. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  280. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0:
  281. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  282. elif 'itemSectionRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]:
  283. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n")
  284. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']:
  285. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  286. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) == 0:
  287. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  288. elif 'channelAboutFullMetadataRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]:
  289. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n")
  290. else:
  291. # 站外用户昵称
  292. if 'title' not in header and 'title' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  293. out_user_name = ''
  294. elif 'title' in header:
  295. out_user_name = header['title']
  296. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']:
  297. out_user_name = ''
  298. else:
  299. out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']['simpleText']
  300. # 站外用户头像
  301. if 'avatar' not in header and 'avatar' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  302. out_avatar_url = ''
  303. elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']:
  304. out_avatar_url = ''
  305. elif len(header['avatar']['thumbnails']) == 0 and len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails']) == 0:
  306. out_avatar_url = ''
  307. elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]:
  308. out_avatar_url = ''
  309. elif 'url' in header['avatar']['thumbnails'][-1]:
  310. out_avatar_url = header['avatar']['thumbnails'][-1]['url']
  311. else:
  312. out_avatar_url = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]['url']
  313. # 站外用户粉丝
  314. if 'subscriberCountText' not in header:
  315. out_fans = 0
  316. elif 'accessibility' not in header['subscriberCountText']:
  317. out_fans = 0
  318. elif 'accessibilityData' not in header['subscriberCountText']['accessibility']:
  319. out_fans = 0
  320. elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']:
  321. out_fans = 0
  322. else:
  323. out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
  324. if '万' in out_fans:
  325. out_fans = int(float(out_fans.split('万')[0])*10000)
  326. else:
  327. pass
  328. # 站外用户总播放量
  329. if 'viewCountText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  330. out_play_cnt = 0
  331. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']:
  332. out_play_cnt = 0
  333. else:
  334. out_play_cnt = int(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText'].split('次')[0].replace(',', ''))
  335. # 站外用户注册时间
  336. if 'joinedDateText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  337. out_create_time = ''
  338. elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']:
  339. out_create_time = ''
  340. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs']) == 0:
  341. out_create_time = ''
  342. elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]:
  343. out_create_time = ''
  344. else:
  345. out_create_time = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月', '-').replace('日', '')
  346. out_user_dict = {
  347. 'out_user_name': out_user_name,
  348. 'out_avatar_url': out_avatar_url,
  349. 'out_fans': out_fans,
  350. 'out_play_cnt': out_play_cnt,
  351. 'out_create_time': out_create_time,
  352. }
  353. # print(out_user_dict)
  354. return out_user_dict
  355. except Exception as e:
  356. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  357. @classmethod
  358. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  359. """
  360. 补全飞书用户表信息,并返回
  361. :param log_type: 日志
  362. :param crawler: 哪款爬虫
  363. :param sheetid: 飞书表
  364. :param env: 正式环境:prod,测试环境:dev
  365. :param machine: 部署机器,阿里云填写 aliyun,线下分别填写 macpro,macair,local
  366. :return: user_list
  367. """
  368. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  369. user_list = []
  370. for i in range(1, len(user_sheet)):
  371. out_uid = user_sheet[i][2]
  372. user_name = user_sheet[i][3]
  373. browse_id = user_sheet[i][5]
  374. our_uid = user_sheet[i][6]
  375. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  376. # 获取站外browse_id,并写入飞书
  377. if browse_id is None:
  378. browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
  379. if browse_id is None:
  380. Common.logger(log_type, crawler).warning('browse_id is None !')
  381. else:
  382. Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]])
  383. Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
  384. # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
  385. if our_uid is None:
  386. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  387. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env)
  388. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  389. if our_user_info is None or len(our_user_info) == 0:
  390. # 获取站外账号信息,写入数据库
  391. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  392. out_avatar_url = out_user_dict['out_avatar_url']
  393. out_create_time = out_user_dict['out_create_time']
  394. out_play_cnt = out_user_dict['out_play_cnt']
  395. out_fans = out_user_dict['out_fans']
  396. tag = 'youtube爬虫,定向爬虫策略'
  397. # 创建站内账号
  398. create_user_dict = {
  399. 'nickName': user_name,
  400. 'avatarUrl': out_avatar_url,
  401. 'tagName': tag,
  402. }
  403. our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
  404. if 'env' == 'prod':
  405. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  406. else:
  407. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  408. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  409. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
  410. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  411. sql = f""" insert into crawler_user(user_id,
  412. out_user_id,
  413. out_user_name,
  414. out_avatar_url,
  415. out_create_time,
  416. out_play_cnt,
  417. out_fans,
  418. platform,
  419. tag)
  420. values({our_uid},
  421. "{out_uid}",
  422. "{user_name}",
  423. "{out_avatar_url}",
  424. "{out_create_time}",
  425. {out_play_cnt},
  426. {out_fans},
  427. "{cls.platform}",
  428. "{tag}") """
  429. MysqlHelper.update_values(log_type, crawler, sql, env)
  430. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  431. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  432. else:
  433. our_uid = our_user_info[0][1]
  434. if 'env' == 'prod':
  435. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  436. else:
  437. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  438. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  439. Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]])
  440. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  441. user_dict = {
  442. 'out_user_id': out_uid,
  443. 'out_user_name': user_name,
  444. 'out_browse_id': browse_id,
  445. 'our_user_id': our_uid,
  446. }
  447. user_list.append(user_dict)
  448. return user_list
  449. @classmethod
  450. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  451. """
  452. 获取个人主页视频列表
  453. :param log_type: 日志
  454. :param crawler: 哪款爬虫
  455. :param browse_id: 每个用户主页的请求参数中唯一值
  456. :param out_uid: 站外用户UID
  457. :return: video_list
  458. """
  459. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  460. payload = json.dumps({
  461. "context": {
  462. "client": {
  463. "hl": "zh-CN",
  464. "gl": "US",
  465. "remoteHost": "38.93.247.21",
  466. "deviceMake": "Apple",
  467. "deviceModel": "",
  468. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  469. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  470. "clientName": "WEB",
  471. "clientVersion": "2.20230201.01.00",
  472. "osName": "Macintosh",
  473. "osVersion": "10_15_7",
  474. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  475. "platform": "DESKTOP",
  476. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  477. "configInfo": {
  478. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  479. },
  480. "timeZone": "Asia/Shanghai",
  481. "browserName": "Chrome",
  482. "browserVersion": "109.0.0.0",
  483. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  484. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  485. "screenWidthPoints": 944,
  486. "screenHeightPoints": 969,
  487. "screenPixelDensity": 1,
  488. "screenDensityFloat": 1,
  489. "utcOffsetMinutes": 480,
  490. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  491. "memoryTotalKbytes": "8000000",
  492. "mainAppWebInfo": {
  493. "graftUrl": f"/{out_uid}/videos",
  494. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  495. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  496. "isWebNativeShareAvailable": True
  497. }
  498. },
  499. "user": {
  500. "lockedSafetyMode": False
  501. },
  502. "request": {
  503. "useSsl": True,
  504. "internalExperimentFlags": [],
  505. "consistencyTokenJars": []
  506. },
  507. "clickTracking": {
  508. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  509. },
  510. "adSignalsInfo": {
  511. "params": [
  512. {
  513. "key": "dt",
  514. "value": "1675676731048"
  515. },
  516. {
  517. "key": "flash",
  518. "value": "0"
  519. },
  520. {
  521. "key": "frm",
  522. "value": "0"
  523. },
  524. {
  525. "key": "u_tz",
  526. "value": "480"
  527. },
  528. {
  529. "key": "u_his",
  530. "value": "4"
  531. },
  532. {
  533. "key": "u_h",
  534. "value": "1080"
  535. },
  536. {
  537. "key": "u_w",
  538. "value": "1920"
  539. },
  540. {
  541. "key": "u_ah",
  542. "value": "1080"
  543. },
  544. {
  545. "key": "u_aw",
  546. "value": "1920"
  547. },
  548. {
  549. "key": "u_cd",
  550. "value": "24"
  551. },
  552. {
  553. "key": "bc",
  554. "value": "31"
  555. },
  556. {
  557. "key": "bih",
  558. "value": "969"
  559. },
  560. {
  561. "key": "biw",
  562. "value": "944"
  563. },
  564. {
  565. "key": "brdim",
  566. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  567. },
  568. {
  569. "key": "vis",
  570. "value": "1"
  571. },
  572. {
  573. "key": "wgl",
  574. "value": "true"
  575. },
  576. {
  577. "key": "ca_type",
  578. "value": "image"
  579. }
  580. ],
  581. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  582. }
  583. },
  584. "browseId": browse_id,
  585. "params": "EgZ2aWRlb3PyBgQKAjoA",
  586. "continuation": cls.continuation
  587. })
  588. headers = {
  589. 'authority': 'www.youtube.com',
  590. 'accept': '*/*',
  591. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  592. 'cache-control': 'no-cache',
  593. 'content-type': 'application/json',
  594. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  595. 'origin': 'https://www.youtube.com',
  596. 'pragma': 'no-cache',
  597. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  598. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  599. 'sec-ch-ua-arch': '"arm"',
  600. 'sec-ch-ua-bitness': '"64"',
  601. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  602. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  603. 'sec-ch-ua-mobile': '?0',
  604. 'sec-ch-ua-model': '',
  605. 'sec-ch-ua-platform': '"macOS"',
  606. 'sec-ch-ua-platform-version': '"12.4.0"',
  607. 'sec-ch-ua-wow64': '?0',
  608. 'sec-fetch-dest': 'empty',
  609. 'sec-fetch-mode': 'same-origin',
  610. 'sec-fetch-site': 'same-origin',
  611. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  612. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  613. 'x-youtube-bootstrap-logged-in': 'false',
  614. 'x-youtube-client-name': '1',
  615. 'x-youtube-client-version': '2.20230201.01.00'
  616. }
  617. # try:
  618. response = requests.post(url=url, headers=headers, data=payload)
  619. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  620. cls.continuation = response.json()['trackingParams']
  621. if response.status_code != 200:
  622. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  623. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  624. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  625. elif 'continuationContents' in response.json():
  626. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  627. if 'richGridContinuation' not in response.json()['continuationContents']:
  628. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  629. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n')
  630. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  631. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  632. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  633. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  634. return feeds
  635. elif 'onResponseReceivedActions' in response.json():
  636. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  637. if len(response.json()['onResponseReceivedActions']) == 0:
  638. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  639. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  640. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  641. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']:
  642. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  643. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0:
  644. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  645. else:
  646. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]
  647. return feeds
  648. else:
  649. Common.logger(log_type, crawler).info('feeds is None\n')
  650. # except Exception as e:
  651. # Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  @classmethod
  def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine):
      """Walk one channel's feed page by page and publish its recent videos.

      Each page comes from ``cls.get_feeds``. For every entry that carries
      ``richItemRenderer -> content -> videoRenderer -> videoId`` the video
      details are fetched with ``cls.get_video_info``. A video published
      within the last 30 days is handed to ``cls.download_publish``; the
      first older video ends the crawl for this channel.

      :param log_type: forwarded to ``Common.logger`` and the helper methods.
      :param crawler: forwarded to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``cls.download_publish``.
      :param oss_endpoint: forwarded to ``cls.download_publish``.
      :param env: forwarded to ``cls.download_publish``.
      :param browse_id: forwarded to ``cls.get_feeds``.
      :param out_uid: forwarded to ``cls.get_feeds`` and ``cls.get_video_info``.
      :param our_uid: forwarded to ``cls.download_publish``.
      :param machine: forwarded to ``cls.get_video_info``.
      :return: None
      """
      video_id_path = ('richItemRenderer', 'content', 'videoRenderer', 'videoId')
      max_age_seconds = 3600 * 24 * 30

      while True:
          feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
          if not feeds:
              # get_feeds can fall through without returning a page; stopping
              # here avoids both len(None) and an endless re-request loop.
              Common.logger(log_type, crawler).warning('feeds is empty, stop crawling\n')
              return

          for feed in feeds:
              # Descend the nested renderer structure; on the first missing key
              # log the object that lacks it and skip this entry.
              node = feed
              video_id = None
              for key in video_id_path:
                  if key not in node:
                      Common.logger(log_type, crawler).warning(f'feeds:{node}\n')
                      break
                  node = node[key]
              else:
                  video_id = node
              if video_id is None:
                  continue

              video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
              if not video_dict:
                  # get_video_info has already logged why no details are available.
                  continue

              # Only the leading YYYY-MM-DD part is parsed, so a value with a
              # trailing time component is still accepted.
              publish_date = str(video_dict['publish_time'])[:10]
              try:
                  publish_time = int(time.mktime(time.strptime(publish_date, "%Y-%m-%d")))
              except ValueError:
                  Common.logger(log_type, crawler).warning(f"publish_time:{video_dict['publish_time']}\n")
                  continue

              # Keep only videos published within the last 30 days.
              if int(time.time()) - publish_time <= max_age_seconds:
                  cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
              else:
                  Common.logger(log_type, crawler).info('发布时间超过30天\n')
                  return
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
      """Fetch the details of one video from the ``youtubei/v1/player`` endpoint.

      Posts a fixed WEB-client context for ``video_id`` and extracts title,
      view count, publish date, author, cover URL and stream URL from the
      response. A non-empty title without Chinese characters is passed
      through ``Translate.google_translate``.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param out_uid: copied unchanged into the result under ``out_uid``.
      :param video_id: id of the video to look up.
      :param machine: forwarded to ``Translate.google_translate``.
      :return: dict with the keys ``video_title``, ``video_id``, ``play_cnt``,
          ``publish_time``, ``user_name``, ``out_uid``, ``cover_url`` and
          ``video_url``; ``None`` when the status is not 200, the body is not
          JSON, or ``streamingData`` / ``videoDetails`` / ``microformat`` is
          missing. Missing individual fields default to ``''`` (``0`` for
          ``play_cnt``).
      """
      url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
      watch_path = f"/watch?v={video_id}"
      watch_url = f"https://www.youtube.com{watch_path}"

      # (key, value) pairs sent as adSignalsInfo.params, in request order.
      ad_signals = (
          ("dt", "1675749222611"),
          ("flash", "0"),
          ("frm", "0"),
          ("u_tz", "480"),
          ("u_his", "3"),
          ("u_h", "1080"),
          ("u_w", "1920"),
          ("u_ah", "1080"),
          ("u_aw", "1920"),
          ("u_cd", "24"),
          ("bc", "31"),
          ("bih", "969"),
          ("biw", "1037"),
          ("brdim", "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"),
          ("vis", "1"),
          ("wgl", "true"),
          ("ca_type", "image"),
      )

      payload = json.dumps({
          "context": {
              "client": {
                  "hl": "zh-CN",
                  "gl": "US",
                  "remoteHost": "38.93.247.21",
                  "deviceMake": "Apple",
                  "deviceModel": "",
                  "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                  "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                  "clientName": "WEB",
                  "clientVersion": "2.20230201.01.00",
                  "osName": "Macintosh",
                  "osVersion": "10_15_7",
                  "originalUrl": watch_url,
                  "platform": "DESKTOP",
                  "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                  "configInfo": {
                      "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                  },
                  "timeZone": "Asia/Shanghai",
                  "browserName": "Chrome",
                  "browserVersion": "109.0.0.0",
                  "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                  "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                  "screenWidthPoints": 1037,
                  "screenHeightPoints": 969,
                  "screenPixelDensity": 1,
                  "screenDensityFloat": 1,
                  "utcOffsetMinutes": 480,
                  "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                  "memoryTotalKbytes": "8000000",
                  "clientScreen": "WATCH",
                  "mainAppWebInfo": {
                      "graftUrl": watch_path,
                      "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                      "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                      "isWebNativeShareAvailable": True
                  }
              },
              "user": {
                  "lockedSafetyMode": False
              },
              "request": {
                  "useSsl": True,
                  "internalExperimentFlags": [],
                  "consistencyTokenJars": []
              },
              "clickTracking": {
                  "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
              },
              "adSignalsInfo": {
                  "params": [{"key": key, "value": value} for key, value in ad_signals],
                  "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
              }
          },
          "videoId": str(video_id),
          "playbackContext": {
              "contentPlaybackContext": {
                  "currentUrl": watch_path,
                  "vis": 0,
                  "splay": False,
                  "autoCaptionsDefaultOn": False,
                  "autonavState": "STATE_NONE",
                  "html5Preference": "HTML5_PREF_WANTS",
                  "signatureTimestamp": 19394,
                  "referer": watch_url,
                  "lactMilliseconds": "-1",
                  "watchAmbientModeContext": {
                      "watchAmbientModeEnabled": True
                  }
              }
          },
          "racyCheckOk": False,
          "contentCheckOk": False
      })
      headers = {
          'authority': 'www.youtube.com',
          'accept': '*/*',
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
          'cache-control': 'no-cache',
          'content-type': 'application/json',
          'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
          'origin': 'https://www.youtube.com',
          'pragma': 'no-cache',
          'referer': watch_url,
          'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
          'sec-ch-ua-arch': '"arm"',
          'sec-ch-ua-bitness': '"64"',
          'sec-ch-ua-full-version': '"109.0.1518.52"',
          'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
          'sec-ch-ua-mobile': '?0',
          'sec-ch-ua-model': '',
          'sec-ch-ua-platform': '"macOS"',
          'sec-ch-ua-platform-version': '"12.4.0"',
          'sec-ch-ua-wow64': '?0',
          'sec-fetch-dest': 'empty',
          'sec-fetch-mode': 'same-origin',
          'sec-fetch-site': 'same-origin',
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
          'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
          'x-youtube-bootstrap-logged-in': 'false',
          'x-youtube-client-name': '1',
          'x-youtube-client-version': '2.20230201.01.00'
      }

      response = requests.post(url=url, headers=headers, data=payload)
      if response.status_code != 200:
          Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
          return None

      # Parse the body once; every later lookup reuses this object.
      try:
          body = response.json()
      except ValueError:
          Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
          return None

      if any(section not in body for section in ('streamingData', 'videoDetails', 'microformat')):
          Common.logger(log_type, crawler).warning(f"get_video_info_response:{body}\n")
          return None

      # A missing renderer is treated like a missing publishDate (empty string).
      microformat_renderer = body['microformat'].get('playerMicroformatRenderer', {})
      video_details = body['videoDetails']
      streaming_data = body['streamingData']

      video_title = video_details.get('title', '')
      if video_title and Translate.is_contains_chinese(video_title) is False:
          # Automatically translate the title into Chinese.
          video_title = Translate.google_translate(video_title, machine)

      play_cnt = int(video_details.get('viewCount', 0))
      publish_time = microformat_renderer.get('publishDate', '')
      user_name = video_details.get('author', '')

      # The last thumbnail / format entry is used, as before.
      thumbnails = video_details.get('thumbnail', {}).get('thumbnails') or []
      cover_url = thumbnails[-1].get('url', '') if thumbnails else ''

      formats = streaming_data.get('formats') or []
      video_url = formats[-1].get('url', '') if formats else ''

      video_dict = {
          'video_title': video_title,
          'video_id': video_id,
          'play_cnt': play_cnt,
          'publish_time': publish_time,
          'user_name': user_name,
          'out_uid': out_uid,
          'cover_url': cover_url,
          'video_url': video_url,
      }
      for field, value in video_dict.items():
          if field != 'out_uid':  # out_uid is caller-supplied and was never logged
              Common.logger(log_type, crawler).info(f'{field}:{value}')
      return video_dict
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint):
      """Download one video, upload it, and record it in Feishu and MySQL.

      Steps, in order: skip videos without title or URL; skip videos already
      present in ``crawler_video``; download the video and probe it with
      ``Common.ffmpeg``; delete the download when probing shows a failed
      download or a duration outside 60..600; otherwise download the cover,
      save the text info, upload through ``Publish.upload_and_publish``,
      write a row to the Feishu sheet ``GVxlYk`` and insert a row into
      ``crawler_video``.

      :param log_type: forwarded to the logging, Feishu and MySQL helpers.
      :param crawler: forwarded to the helpers; also the root of the local
          ``./{crawler}/videos/`` directory.
      :param video_dict: dict as produced by ``get_video_info``; it is
          extended in place with size, duration and counter fields.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param our_uid: forwarded to ``Publish.upload_and_publish`` and stored
          as ``user_id``.
      :param env: ``'dev'`` selects the test admin host for the video link;
          also forwarded to the MySQL and publish helpers.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :return: None
      """

      def sql_text(value):
          """Escape a value for use inside a double-quoted SQL string literal."""
          # Titles come from the remote site; an unescaped quote or backslash
          # would break the statement or let the text alter it.
          return str(value).replace('\\', '\\\\').replace('"', '\\"')

      if video_dict['video_title'] == '' or video_dict['video_url'] == '':
          Common.logger(log_type, crawler).info('无效视频\n')
          return

      sql = f""" select * from crawler_video where platform="{sql_text(cls.platform)}" and out_video_id="{sql_text(video_dict['video_id'])}" """
      repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
      if repeat_video is not None and len(repeat_video) != 0:
          Common.logger(log_type, crawler).info('视频已下载\n')
          return

      # Download the video.
      video_dir = f"./{crawler}/videos/{video_dict['video_title']}/"
      Common.logger(log_type, crawler).info('开始下载视频...')
      Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
      ffmpeg_dict = Common.ffmpeg(log_type, crawler, video_dir)
      video_width = ffmpeg_dict['width']
      video_height = ffmpeg_dict['height']
      raw_duration = ffmpeg_dict['duration']
      video_size = ffmpeg_dict['size']
      # A missing duration counts as 0 so it reaches the download-error branch
      # instead of raising inside int().
      duration = int(raw_duration) if raw_duration is not None else 0
      Common.logger(log_type, crawler).info(f'video_width:{video_width}')
      Common.logger(log_type, crawler).info(f'video_height:{video_height}')
      Common.logger(log_type, crawler).info(f'duration:{duration}')
      Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')

      video_dict['video_width'] = video_width
      video_dict['video_height'] = video_height
      video_dict['duration'] = duration
      video_dict['comment_cnt'] = 0
      video_dict['like_cnt'] = 0
      video_dict['share_cnt'] = 0
      video_dict['avatar_url'] = video_dict['cover_url']
      video_dict['session'] = f'youtube{int(time.time())}'
      rule = '1,2'

      # Check for a failed download first: a zero duration would otherwise be
      # reported as a rule violation by the range check below.
      if video_size is None or video_size == 0 or duration == 0:
          # Delete the video folder.
          shutil.rmtree(video_dir)
          Common.logger(log_type, crawler).info("视频下载出错,删除成功\n")
          return
      if duration < 60 or duration > 600:
          # Delete the video folder.
          shutil.rmtree(video_dir)
          Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
          return

      # Download the cover.
      Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
      # Save the video's text info.
      Common.save_video_info(log_type, crawler, video_dict)

      # Upload the video; only the admin host of the resulting link depends on env.
      Common.logger(log_type, crawler).info("开始上传视频")
      our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
      admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
      our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"
      Common.logger(log_type, crawler).info("视频上传完成")

      # Save the video info to Feishu: insert a row, then fill it.
      Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
      upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
      values = [[upload_time,
                 "定向榜",
                 video_dict['video_id'],
                 video_dict['video_title'],
                 our_video_link,
                 video_dict['play_cnt'],
                 video_dict['duration'],
                 f'{video_width}*{video_height}',
                 video_dict['publish_time'],
                 video_dict['user_name'],
                 video_dict['cover_url'],
                 video_dict['video_url']
                 ]]
      time.sleep(1)
      Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
      Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

      # Save the video info to the database.
      sql = f""" insert into crawler_video(video_id,
                      user_id,
                      out_user_id,
                      platform,
                      strategy,
                      out_video_id,
                      video_title,
                      cover_url,
                      video_url,
                      duration,
                      publish_time,
                      play_cnt,
                      crawler_rule,
                      width,
                      height)
                      values({our_video_id},
                      "{sql_text(our_uid)}",
                      "{sql_text(video_dict['out_uid'])}",
                      "{sql_text(cls.platform)}",
                      "定向爬虫策略",
                      "{sql_text(video_dict['video_id'])}",
                      "{sql_text(video_dict['video_title'])}",
                      "{sql_text(video_dict['cover_url'])}",
                      "{sql_text(video_dict['video_url'])}",
                      {int(duration)},
                      "{sql_text(video_dict['publish_time'])}",
                      {int(video_dict['play_cnt'])},
                      "{sql_text(rule)}",
                      {int(video_width)},
                      {int(video_height)}) """
      MysqlHelper.update_values(log_type, crawler, sql, env)
      Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the home-page videos of every user listed in the Feishu sheet.

      The user list is read with ``cls.get_user_from_feishu`` from sheet
      ``c467d7``. For each user ``cls.get_videos`` is run, followed by a
      10-second pause and a reset of ``cls.continuation``.

      :param log_type: forwarded to ``Common.logger`` and the helper methods.
      :param crawler: forwarded to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``cls.get_videos``.
      :param oss_endpoint: forwarded to ``cls.get_videos``.
      :param env: forwarded to ``cls.get_user_from_feishu`` and ``cls.get_videos``.
      :param machine: forwarded to ``cls.get_user_from_feishu`` and ``cls.get_videos``.
      :return: None
      """
      followed_users = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
      if len(followed_users) == 0:
          Common.logger(log_type, crawler).warning('用户列表为空\n')
          return

      for account in followed_users:
          channel_uid = account['out_user_id']
          channel_name = account['out_user_name']
          channel_browse_id = account['out_browse_id']
          platform_uid = account['our_user_id']

          Common.logger(log_type, crawler).info(f'获取 {channel_name} 主页视频\n')
          cls.get_videos(
              log_type,
              crawler,
              strategy,
              oss_endpoint,
              env,
              channel_browse_id,
              channel_uid,
              platform_uid,
              machine,
          )

          Common.logger(log_type, crawler).info('休眠 10 秒')
          time.sleep(10)
          # Clear the paging token so the next user starts from the first page.
          cls.continuation = ''
  if __name__ == "__main__":
      # Manual check: resolve the browse id of a sample channel handle and print it.
      sample_browse_id = Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local")
      print(sample_browse_id)

      # Other entry points kept for manual debugging; uncomment as needed.
      # NOTE(review): the get_video_info call below passes fewer arguments than
      # the method's current signature (out_uid and machine are missing).
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')