youtube_follow.py 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. # import emoji
  16. import requests
  17. from selenium import webdriver
  18. from selenium.webdriver.chrome.service import Service
  19. from selenium.webdriver.common.by import By
  20. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  21. sys.path.append(os.getcwd())
  22. from common.common import Common
  23. from common.db import MysqlHelper
  24. from common.feishu import Feishu
  25. from common.users import Users
  26. from common.publish import Publish
  27. from common.translate import Translate
  28. class Follow:
    # Pagination parameter: sent as the "continuation" field of feed requests
    continuation = ''
    # Platform being crawled; used as the platform value in crawler_user queries
    platform = 'youtube'
  33. @classmethod
  34. def get_browse_id(cls, log_type, crawler, out_user_id, machine):
  35. """
  36. 获取每个用户的 browse_id
  37. :param log_type: 日志
  38. :param crawler: 哪款爬虫
  39. :param out_user_id: 站外用户 UID
  40. :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local
  41. :return: browse_id
  42. """
  43. try:
  44. # 打印请求配置
  45. ca = DesiredCapabilities.CHROME
  46. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  47. # 不打开浏览器运行
  48. chrome_options = webdriver.ChromeOptions()
  49. chrome_options.add_argument("--headless")
  50. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  51. chrome_options.add_argument("--no-sandbox")
  52. # driver初始化
  53. if machine == 'aliyun' or machine == 'aliyun_hk':
  54. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  55. elif machine == 'macpro':
  56. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  57. elif machine == 'macair':
  58. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/piaoquan/Downloads/chromedriver'))
  59. else:
  60. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  61. driver.implicitly_wait(10)
  62. url = f'https://www.youtube.com/{out_user_id}/videos'
  63. driver.get(url)
  64. # driver.save_screenshot("./1.png")
  65. # 向上滑动 1000 个像素
  66. # driver.execute_script('window.scrollBy(0, 2000)')
  67. # driver.save_screenshot("./2.png")
  68. time.sleep(3)
  69. accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
  70. accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
  71. if len(accept_btns) != 0:
  72. accept_btns[0].click()
  73. time.sleep(2)
  74. elif len(accept_btns_eng) != 0:
  75. accept_btns_eng[0].click()
  76. time.sleep(2)
  77. browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
  78. driver.quit()
  79. return browse_id
  80. except Exception as e:
  81. Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  82. @classmethod
  83. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  84. """
  85. 获取站外用户信息
  86. :param log_type: 日志
  87. :param crawler: 哪款爬虫
  88. :param browse_id: browse_id
  89. :param out_user_id: 站外用户 UID
  90. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  91. 'out_avatar_url': 站外用户头像,
  92. 'out_fans': 站外用户粉丝量,
  93. 'out_play_cnt': 站外用户总播放量,
  94. 'out_create_time': 站外用户创建时间}
  95. """
  96. try:
  97. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  98. payload = json.dumps({
  99. "context": {
  100. "client": {
  101. "hl": "zh-CN",
  102. "gl": "US",
  103. "remoteHost": "38.93.247.21",
  104. "deviceMake": "Apple",
  105. "deviceModel": "",
  106. "visitorData": "CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D",
  107. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  108. "clientName": "WEB",
  109. "clientVersion": "2.20230201.01.00",
  110. "osName": "Macintosh",
  111. "osVersion": "10_15_7",
  112. "originalUrl": f"https://www.youtube.com/{out_user_id}/about",
  113. "screenPixelDensity": 1,
  114. "platform": "DESKTOP",
  115. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  116. "configInfo": {
  117. "appInstallData": "CMvUgp8GEKLsrgUQzN-uBRC41K4FENfkrgUQsvWuBRDkoP4SELiLrgUQo_muBRDn964FENnprgUQlPiuBRC2nP4SEPuj_hIQ4tSuBRCJ6K4FEILdrgUQh92uBRD-7q4FEMz1rgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  118. },
  119. "screenDensityFloat": 1,
  120. "timeZone": "Asia/Shanghai",
  121. "browserName": "Chrome",
  122. "browserVersion": "109.0.0.0",
  123. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  124. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EMvUgp8GGOmU7Z4G",
  125. "screenWidthPoints": 805,
  126. "screenHeightPoints": 969,
  127. "utcOffsetMinutes": 480,
  128. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  129. "memoryTotalKbytes": "8000000",
  130. "mainAppWebInfo": {
  131. "graftUrl": f"/{out_user_id}/about",
  132. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  133. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  134. "isWebNativeShareAvailable": True
  135. }
  136. },
  137. "user": {
  138. "lockedSafetyMode": False
  139. },
  140. "request": {
  141. "useSsl": True,
  142. "internalExperimentFlags": [],
  143. "consistencyTokenJars": []
  144. },
  145. "clickTracking": {
  146. "clickTrackingParams": "CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak="
  147. },
  148. "adSignalsInfo": {
  149. "params": [
  150. {
  151. "key": "dt",
  152. "value": "1675668045032"
  153. },
  154. {
  155. "key": "flash",
  156. "value": "0"
  157. },
  158. {
  159. "key": "frm",
  160. "value": "0"
  161. },
  162. {
  163. "key": "u_tz",
  164. "value": "480"
  165. },
  166. {
  167. "key": "u_his",
  168. "value": "1"
  169. },
  170. {
  171. "key": "u_h",
  172. "value": "1080"
  173. },
  174. {
  175. "key": "u_w",
  176. "value": "1920"
  177. },
  178. {
  179. "key": "u_ah",
  180. "value": "1080"
  181. },
  182. {
  183. "key": "u_aw",
  184. "value": "1920"
  185. },
  186. {
  187. "key": "u_cd",
  188. "value": "24"
  189. },
  190. {
  191. "key": "bc",
  192. "value": "31"
  193. },
  194. {
  195. "key": "bih",
  196. "value": "969"
  197. },
  198. {
  199. "key": "biw",
  200. "value": "805"
  201. },
  202. {
  203. "key": "brdim",
  204. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,805,969"
  205. },
  206. {
  207. "key": "vis",
  208. "value": "1"
  209. },
  210. {
  211. "key": "wgl",
  212. "value": "true"
  213. },
  214. {
  215. "key": "ca_type",
  216. "value": "image"
  217. }
  218. ],
  219. "bid": "ANyPxKqvCBKtjNeHQ6uTC7sKj2ZwIvEkk3oRlmdU7H_soRJWLc4IQCkqMVP68RR-Xae0h3nMdOKYOtVh_Yb2OYr4znd60I5j7A"
  220. }
  221. },
  222. "browseId": browse_id,
  223. "params": "EgVhYm91dPIGBAoCEgA%3D"
  224. })
  225. headers = {
  226. 'authority': 'www.youtube.com',
  227. 'accept': '*/*',
  228. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  229. 'cache-control': 'no-cache',
  230. 'content-type': 'application/json',
  231. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; GPS=1; PREF=tz=Asia.Shanghai; ST-h076le=itct=CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D&csn=MC45NDM2MjgyNzM1ODE5NDAz&endpoint=%7B%22clickTrackingParams%22%3A%22CBMQ8JMBGAoiEwjY34r0rYD9AhURSEwIHfHZAak%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40weitravel%2Fabout%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UC08jgxf119fzynp2uHCvZIg%22%2C%22params%22%3A%22EgVhYm91dPIGBAoCEgA%253D%22%2C%22canonicalBaseUrl%22%3A%22%2F%40weitravel%22%7D%7D',
  232. 'origin': 'https://www.youtube.com',
  233. 'pragma': 'no-cache',
  234. 'referer': f'https://www.youtube.com/{out_user_id}/videos',
  235. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  236. 'sec-ch-ua-arch': '"arm"',
  237. 'sec-ch-ua-bitness': '"64"',
  238. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  239. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  240. 'sec-ch-ua-mobile': '?0',
  241. 'sec-ch-ua-model': '',
  242. 'sec-ch-ua-platform': '"macOS"',
  243. 'sec-ch-ua-platform-version': '"12.4.0"',
  244. 'sec-ch-ua-wow64': '?0',
  245. 'sec-fetch-dest': 'empty',
  246. 'sec-fetch-mode': 'same-origin',
  247. 'sec-fetch-site': 'same-origin',
  248. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  249. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjL1IKfBg%3D%3D',
  250. 'x-youtube-bootstrap-logged-in': 'false',
  251. 'x-youtube-client-name': '1',
  252. 'x-youtube-client-version': '2.20230201.01.00'
  253. }
  254. response = requests.post(url=url, headers=headers, data=payload)
  255. if response.status_code != 200:
  256. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  257. elif 'contents' not in response.text or 'header' not in response.text:
  258. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.text}\n')
  259. elif 'c4TabbedHeaderRenderer' not in response.json()['header']:
  260. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()["header"]}\n')
  261. elif 'twoColumnBrowseResultsRenderer' not in response.json()['contents']:
  262. Common.logger(log_type, crawler).warning(f'get_out_user_info:{response.json()}\n')
  263. elif 'tabs' not in response.json()['contents']['twoColumnBrowseResultsRenderer']:
  264. Common.logger(log_type, crawler).warning(f"get_out_user_info:{response.json()['contents']['twoColumnBrowseResultsRenderer']}\n")
  265. else:
  266. header = response.json()['header']['c4TabbedHeaderRenderer']
  267. tabs = response.json()['contents']['twoColumnBrowseResultsRenderer']['tabs']
  268. for i in range(len(tabs)):
  269. if 'tabRenderer' not in tabs[i]:
  270. title = ''
  271. elif 'title' not in tabs[i]['tabRenderer']:
  272. title = ''
  273. else:
  274. title = tabs[i]['tabRenderer']['title']
  275. if title == '简介':
  276. if 'tabRenderer' not in tabs[i]:
  277. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]}\n")
  278. elif 'content' not in tabs[i]['tabRenderer']:
  279. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']}\n")
  280. elif 'sectionListRenderer' not in tabs[i]['tabRenderer']['content']:
  281. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']}\n")
  282. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']:
  283. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  284. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents']) == 0:
  285. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']}\n")
  286. elif 'itemSectionRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]:
  287. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]}\n")
  288. elif 'contents' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']:
  289. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  290. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) == 0:
  291. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']}\n")
  292. elif 'channelAboutFullMetadataRenderer' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]:
  293. Common.logger(log_type, crawler).warning(f"get_out_user_info:{tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]}\n")
  294. else:
  295. # 站外用户昵称
  296. if 'title' not in header and 'title' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  297. out_user_name = ''
  298. elif 'title' in header:
  299. out_user_name = header['title']
  300. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']:
  301. out_user_name = ''
  302. else:
  303. out_user_name = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['title']['simpleText']
  304. # 站外用户头像
  305. if 'avatar' not in header and 'avatar' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  306. out_avatar_url = ''
  307. elif 'thumbnails' not in header['avatar'] and 'thumbnails' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']:
  308. out_avatar_url = ''
  309. elif len(header['avatar']['thumbnails']) == 0 and len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails']) == 0:
  310. out_avatar_url = ''
  311. elif 'url' not in header['avatar']['thumbnails'][-1] and 'url' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]:
  312. out_avatar_url = ''
  313. elif 'url' in header['avatar']['thumbnails'][-1]:
  314. out_avatar_url = header['avatar']['thumbnails'][-1]['url']
  315. else:
  316. out_avatar_url = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['avatar']['thumbnails'][-1]['url']
  317. # 站外用户粉丝
  318. if 'subscriberCountText' not in header:
  319. out_fans = 0
  320. elif 'accessibility' not in header['subscriberCountText']:
  321. out_fans = 0
  322. elif 'accessibilityData' not in header['subscriberCountText']['accessibility']:
  323. out_fans = 0
  324. elif 'label' not in header['subscriberCountText']['accessibility']['accessibilityData']:
  325. out_fans = 0
  326. else:
  327. out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
  328. if '万' in out_fans:
  329. out_fans = int(float(out_fans.split('万')[0])*10000)
  330. elif "位" in out_fans:
  331. out_fans = int(out_fans.split('位')[0].replace(",", ""))
  332. else:
  333. pass
  334. # 站外用户总播放量
  335. if 'viewCountText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  336. out_play_cnt = 0
  337. elif 'simpleText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']:
  338. out_play_cnt = 0
  339. else:
  340. out_play_cnt = int(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText'].split('次')[0].replace(',', ''))
  341. # 站外用户注册时间
  342. if 'joinedDateText' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']:
  343. out_create_time = ''
  344. elif 'runs' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']:
  345. out_create_time = ''
  346. elif len(tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs']) == 0:
  347. out_create_time = ''
  348. elif 'text' not in tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]:
  349. out_create_time = ''
  350. else:
  351. out_create_time = tabs[i]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][0]['text'].replace('年', '-').replace('月', '-').replace('日', '')
  352. out_user_dict = {
  353. 'out_user_name': out_user_name,
  354. 'out_avatar_url': out_avatar_url,
  355. 'out_fans': out_fans,
  356. 'out_play_cnt': out_play_cnt,
  357. 'out_create_time': out_create_time,
  358. }
  359. # print(out_user_dict)
  360. return out_user_dict
  361. except Exception as e:
  362. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  363. @classmethod
  364. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  365. """
  366. 补全飞书用户表信息,并返回
  367. :param log_type: 日志
  368. :param crawler: 哪款爬虫
  369. :param sheetid: 飞书表
  370. :param env: 正式环境:prod,测试环境:dev
  371. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  372. :return: user_list
  373. """
  374. try:
  375. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  376. user_list = []
  377. for i in range(1, len(user_sheet)):
  378. # for i in range(181, len(user_sheet)):
  379. out_uid = user_sheet[i][2]
  380. user_name = user_sheet[i][3]
  381. browse_id = user_sheet[i][5]
  382. our_uid = user_sheet[i][6]
  383. if out_uid is not None and user_name is not None:
  384. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  385. # 获取站外browse_id,并写入飞书
  386. if browse_id is None:
  387. browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
  388. if browse_id is None:
  389. Common.logger(log_type, crawler).warning('browse_id is None !')
  390. else:
  391. Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]])
  392. Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
  393. # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
  394. if our_uid is None:
  395. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  396. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  397. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  398. if our_user_info is None or len(our_user_info) == 0:
  399. # 获取站外账号信息,写入数据库
  400. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  401. out_avatar_url = out_user_dict['out_avatar_url']
  402. out_create_time = out_user_dict['out_create_time']
  403. out_play_cnt = out_user_dict['out_play_cnt']
  404. out_fans = out_user_dict['out_fans']
  405. tag = 'youtube爬虫,定向爬虫策略'
  406. # 创建站内账号
  407. create_user_dict = {
  408. 'nickName': user_name,
  409. 'avatarUrl': out_avatar_url,
  410. 'tagName': tag,
  411. }
  412. our_uid = Users.create_uid(log_type, crawler, create_user_dict, env)
  413. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  414. if env == 'prod':
  415. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  416. else:
  417. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  418. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  419. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
  420. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  421. sql = f""" insert into crawler_user(user_id,
  422. out_user_id,
  423. out_user_name,
  424. out_avatar_url,
  425. out_create_time,
  426. out_play_cnt,
  427. out_fans,
  428. platform,
  429. tag)
  430. values({our_uid},
  431. "{out_uid}",
  432. "{user_name}",
  433. "{out_avatar_url}",
  434. "{out_create_time}",
  435. {out_play_cnt},
  436. {out_fans},
  437. "{cls.platform}",
  438. "{tag}") """
  439. Common.logger(log_type, crawler).info(f'sql:{sql}')
  440. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  441. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  442. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  443. else:
  444. our_uid = our_user_info[0][1]
  445. if 'env' == 'prod':
  446. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  447. else:
  448. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  449. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  450. Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]])
  451. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  452. user_dict = {
  453. 'out_user_id': out_uid,
  454. 'out_user_name': user_name,
  455. 'out_browse_id': browse_id,
  456. 'our_user_id': our_uid,
  457. }
  458. user_list.append(user_dict)
  459. else:
  460. pass
  461. return user_list
  462. except Exception as e:
  463. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  464. @classmethod
  465. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  466. """
  467. 获取个人主页视频列表
  468. :param log_type: 日志
  469. :param crawler: 哪款爬虫
  470. :param browse_id: 每个用户主页的请求参数中唯一值
  471. :param out_uid: 站外用户UID
  472. :return: video_list
  473. """
  474. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  475. payload = json.dumps({
  476. "context": {
  477. "client": {
  478. "hl": "zh-CN",
  479. "gl": "US",
  480. "remoteHost": "38.93.247.21",
  481. "deviceMake": "Apple",
  482. "deviceModel": "",
  483. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  484. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  485. "clientName": "WEB",
  486. "clientVersion": "2.20230201.01.00",
  487. "osName": "Macintosh",
  488. "osVersion": "10_15_7",
  489. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  490. "platform": "DESKTOP",
  491. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  492. "configInfo": {
  493. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  494. },
  495. "timeZone": "Asia/Shanghai",
  496. "browserName": "Chrome",
  497. "browserVersion": "109.0.0.0",
  498. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  499. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  500. "screenWidthPoints": 944,
  501. "screenHeightPoints": 969,
  502. "screenPixelDensity": 1,
  503. "screenDensityFloat": 1,
  504. "utcOffsetMinutes": 480,
  505. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  506. "memoryTotalKbytes": "8000000",
  507. "mainAppWebInfo": {
  508. "graftUrl": f"/{out_uid}/videos",
  509. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  510. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  511. "isWebNativeShareAvailable": True
  512. }
  513. },
  514. "user": {
  515. "lockedSafetyMode": False
  516. },
  517. "request": {
  518. "useSsl": True,
  519. "internalExperimentFlags": [],
  520. "consistencyTokenJars": []
  521. },
  522. "clickTracking": {
  523. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  524. },
  525. "adSignalsInfo": {
  526. "params": [
  527. {
  528. "key": "dt",
  529. "value": "1675676731048"
  530. },
  531. {
  532. "key": "flash",
  533. "value": "0"
  534. },
  535. {
  536. "key": "frm",
  537. "value": "0"
  538. },
  539. {
  540. "key": "u_tz",
  541. "value": "480"
  542. },
  543. {
  544. "key": "u_his",
  545. "value": "4"
  546. },
  547. {
  548. "key": "u_h",
  549. "value": "1080"
  550. },
  551. {
  552. "key": "u_w",
  553. "value": "1920"
  554. },
  555. {
  556. "key": "u_ah",
  557. "value": "1080"
  558. },
  559. {
  560. "key": "u_aw",
  561. "value": "1920"
  562. },
  563. {
  564. "key": "u_cd",
  565. "value": "24"
  566. },
  567. {
  568. "key": "bc",
  569. "value": "31"
  570. },
  571. {
  572. "key": "bih",
  573. "value": "969"
  574. },
  575. {
  576. "key": "biw",
  577. "value": "944"
  578. },
  579. {
  580. "key": "brdim",
  581. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  582. },
  583. {
  584. "key": "vis",
  585. "value": "1"
  586. },
  587. {
  588. "key": "wgl",
  589. "value": "true"
  590. },
  591. {
  592. "key": "ca_type",
  593. "value": "image"
  594. }
  595. ],
  596. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  597. }
  598. },
  599. "browseId": browse_id,
  600. "params": "EgZ2aWRlb3PyBgQKAjoA",
  601. "continuation": cls.continuation
  602. })
  603. headers = {
  604. 'authority': 'www.youtube.com',
  605. 'accept': '*/*',
  606. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  607. 'cache-control': 'no-cache',
  608. 'content-type': 'application/json',
  609. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  610. 'origin': 'https://www.youtube.com',
  611. 'pragma': 'no-cache',
  612. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  613. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  614. 'sec-ch-ua-arch': '"arm"',
  615. 'sec-ch-ua-bitness': '"64"',
  616. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  617. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  618. 'sec-ch-ua-mobile': '?0',
  619. 'sec-ch-ua-model': '',
  620. 'sec-ch-ua-platform': '"macOS"',
  621. 'sec-ch-ua-platform-version': '"12.4.0"',
  622. 'sec-ch-ua-wow64': '?0',
  623. 'sec-fetch-dest': 'empty',
  624. 'sec-fetch-mode': 'same-origin',
  625. 'sec-fetch-site': 'same-origin',
  626. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  627. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  628. 'x-youtube-bootstrap-logged-in': 'false',
  629. 'x-youtube-client-name': '1',
  630. 'x-youtube-client-version': '2.20230201.01.00'
  631. }
  632. try:
  633. response = requests.post(url=url, headers=headers, data=payload)
  634. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  635. cls.continuation = response.json()['trackingParams']
  636. if response.status_code != 200:
  637. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  638. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  639. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  640. elif 'continuationContents' in response.json():
  641. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  642. if 'richGridContinuation' not in response.json()['continuationContents']:
  643. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  644. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n')
  645. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  646. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  647. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  648. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  649. return feeds
  650. elif 'onResponseReceivedActions' in response.json():
  651. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  652. if len(response.json()['onResponseReceivedActions']) == 0:
  653. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  654. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  655. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  656. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']:
  657. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  658. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0:
  659. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  660. else:
  661. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]
  662. return feeds
  663. else:
  664. Common.logger(log_type, crawler).info('feeds is None\n')
  665. except Exception as e:
  666. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  @classmethod
  def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine):
      """Walk one channel's feed and publish every video published within 180 days.

      Feed pages are fetched with ``cls.get_feeds`` until one of the stop
      conditions below is met. For each feed item the video id is read from
      ``richItemRenderer -> content -> videoRenderer -> videoId``, its metadata
      is fetched with ``cls.get_video_info`` and, when recent enough, the video
      is passed to ``cls.download_publish``.

      The walk stops when:
        * ``get_feeds`` returns nothing (``None`` or an empty list);
        * a feed item lacks any level of the expected nesting;
        * a video is older than 180 days.

      :param log_type: forwarded to ``Common.logger`` and the helper methods.
      :param crawler: forwarded to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``download_publish``.
      :param oss_endpoint: forwarded to ``download_publish``.
      :param env: forwarded to ``download_publish``.
      :param browse_id: forwarded to ``get_feeds``.
      :param out_uid: forwarded to ``get_feeds`` and ``get_video_info``.
      :param our_uid: forwarded to ``download_publish``.
      :param machine: forwarded to ``get_video_info`` and ``download_publish``.
      :return: None. Any exception is logged and swallowed.
      """
      max_age_seconds = 3600 * 24 * 180
      try:
          while True:
              feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
              # get_feeds returns None on every failure path, and an empty page
              # would otherwise make this loop spin forever.
              if not feeds:
                  Common.logger(log_type, crawler).info('feeds is empty, stop crawling this user\n')
                  return
              for feed in feeds:
                  if 'richItemRenderer' not in feed:
                      Common.logger(log_type, crawler).warning(f'feeds:{feed}\n')
                      return
                  rich_item = feed['richItemRenderer']
                  if 'content' not in rich_item:
                      Common.logger(log_type, crawler).warning(f'feeds:{rich_item}\n')
                      return
                  content = rich_item['content']
                  if 'videoRenderer' not in content:
                      Common.logger(log_type, crawler).warning(f'feeds:{content}\n')
                      return
                  video_renderer = content['videoRenderer']
                  if 'videoId' not in video_renderer:
                      Common.logger(log_type, crawler).warning(f'feeds:{video_renderer}\n')
                      return

                  video_id = video_renderer['videoId']
                  video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
                  if video_dict is None:
                      # get_video_info already logged the reason; one broken
                      # video should not abort the rest of the channel.
                      continue

                  # Use the timestamp get_video_info already computed: it copes
                  # with both date-only and date-time strings, and is 0 when the
                  # publish date is unknown.
                  publish_time_stamp = video_dict['publish_time_stamp']
                  if not publish_time_stamp:
                      Common.logger(log_type, crawler).warning(f'publish_time is missing, video_id:{video_id}\n')
                      continue

                  # Only videos published within the last 180 days are kept.
                  if int(time.time()) - publish_time_stamp <= max_age_seconds:
                      cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
                  else:
                      Common.logger(log_type, crawler).info('发布时间超过180天\n')
                      return
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  697. @classmethod
  698. def filter_emoji(cls, title):
  699. # 过滤表情
  700. try:
  701. co = re.compile(u'[\U00010000-\U0010ffff]')
  702. except re.error:
  703. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  704. return co.sub("", title)
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
      """Fetch metadata for one video from the YouTube ``youtubei/v1/player`` endpoint.

      The request replays a captured browser session (hard-coded client
      context, cookie and headers) with ``video_id`` substituted in. The title
      is passed through ``cls.filter_emoji`` and ``Translate.google_translate``
      and then stripped of whitespace, slashes and HTML-entity residue.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param out_uid: copied unchanged into the result as ``out_uid``.
      :param video_id: YouTube video id to look up.
      :param machine: forwarded to ``Translate.google_translate``.
      :return: dict with keys ``video_title``, ``video_id``, ``duration``,
          ``play_cnt``, ``publish_time``, ``publish_time_stamp``, ``user_name``,
          ``out_uid``, ``cover_url`` and ``video_url``. ``None`` when the
          response is not HTTP 200, lacks ``streamingData``/``videoDetails``/
          ``microformat``, or any exception occurs (it is logged, not raised).
      """
      try:
          url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
          payload = json.dumps({
              "context": {
                  "client": {
                      "hl": "zh-CN",
                      "gl": "US",
                      "remoteHost": "38.93.247.21",
                      "deviceMake": "Apple",
                      "deviceModel": "",
                      "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                      "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                      "clientName": "WEB",
                      "clientVersion": "2.20230201.01.00",
                      "osName": "Macintosh",
                      "osVersion": "10_15_7",
                      "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
                      "platform": "DESKTOP",
                      "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                      "configInfo": {
                          "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                      },
                      "timeZone": "Asia/Shanghai",
                      "browserName": "Chrome",
                      "browserVersion": "109.0.0.0",
                      "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                      "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                      "screenWidthPoints": 1037,
                      "screenHeightPoints": 969,
                      "screenPixelDensity": 1,
                      "screenDensityFloat": 1,
                      "utcOffsetMinutes": 480,
                      "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                      "memoryTotalKbytes": "8000000",
                      "clientScreen": "WATCH",
                      "mainAppWebInfo": {
                          "graftUrl": f"/watch?v={video_id}",
                          "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                          "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                          "isWebNativeShareAvailable": True
                      }
                  },
                  "user": {
                      "lockedSafetyMode": False
                  },
                  "request": {
                      "useSsl": True,
                      "internalExperimentFlags": [],
                      "consistencyTokenJars": []
                  },
                  "clickTracking": {
                      "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
                  },
                  "adSignalsInfo": {
                      "params": [
                          {"key": "dt", "value": "1675749222611"},
                          {"key": "flash", "value": "0"},
                          {"key": "frm", "value": "0"},
                          {"key": "u_tz", "value": "480"},
                          {"key": "u_his", "value": "3"},
                          {"key": "u_h", "value": "1080"},
                          {"key": "u_w", "value": "1920"},
                          {"key": "u_ah", "value": "1080"},
                          {"key": "u_aw", "value": "1920"},
                          {"key": "u_cd", "value": "24"},
                          {"key": "bc", "value": "31"},
                          {"key": "bih", "value": "969"},
                          {"key": "biw", "value": "1037"},
                          {"key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"},
                          {"key": "vis", "value": "1"},
                          {"key": "wgl", "value": "true"},
                          {"key": "ca_type", "value": "image"}
                      ],
                      "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
                  }
              },
              "videoId": str(video_id),
              "playbackContext": {
                  "contentPlaybackContext": {
                      "currentUrl": f"/watch?v={video_id}",
                      "vis": 0,
                      "splay": False,
                      "autoCaptionsDefaultOn": False,
                      "autonavState": "STATE_NONE",
                      "html5Preference": "HTML5_PREF_WANTS",
                      "signatureTimestamp": 19394,
                      "referer": f"https://www.youtube.com/watch?v={video_id}",
                      "lactMilliseconds": "-1",
                      "watchAmbientModeContext": {
                          "watchAmbientModeEnabled": True
                      }
                  }
              },
              "racyCheckOk": False,
              "contentCheckOk": False
          })
          headers = {
              'authority': 'www.youtube.com',
              'accept': '*/*',
              'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
              'cache-control': 'no-cache',
              'content-type': 'application/json',
              'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
              'origin': 'https://www.youtube.com',
              'pragma': 'no-cache',
              'referer': f'https://www.youtube.com/watch?v={video_id}',
              'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
              'sec-ch-ua-arch': '"arm"',
              'sec-ch-ua-bitness': '"64"',
              'sec-ch-ua-full-version': '"109.0.1518.52"',
              'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-model': '',
              'sec-ch-ua-platform': '"macOS"',
              'sec-ch-ua-platform-version': '"12.4.0"',
              'sec-ch-ua-wow64': '?0',
              'sec-fetch-dest': 'empty',
              'sec-fetch-mode': 'same-origin',
              'sec-fetch-site': 'same-origin',
              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
              'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
              'x-youtube-bootstrap-logged-in': 'false',
              'x-youtube-client-name': '1',
              'x-youtube-client-version': '2.20230201.01.00'
          }
          response = requests.post(url=url, headers=headers, data=payload)
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
              return None

          # Decode the body once instead of re-parsing it for every key check.
          data = response.json()
          if 'streamingData' not in data or 'videoDetails' not in data or 'microformat' not in data:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{data}\n")
              return None

          playerMicroformatRenderer = data['microformat']['playerMicroformatRenderer']
          videoDetails = data['videoDetails']

          # video_title: drop emoji, translate (the original comment says the
          # title is auto-translated into Chinese), then strip characters the
          # rest of the pipeline does not want in a title.
          video_title = cls.filter_emoji(videoDetails.get('title', ''))
          video_title = Translate.google_translate(video_title, machine).strip()
          # "&amp;" must be removed before the bare "&" and ";" passes;
          # otherwise the entity is broken up first and leaves "amp" behind.
          for noise in ("\\", " ", "\n", "/", "\r", "&NBSP", "&amp;", "&", ";"):
              video_title = video_title.replace(noise, "")

          # duration (seconds) and play_cnt default to 0 when absent.
          duration = int(videoDetails.get('lengthSeconds', 0))
          play_cnt = int(videoDetails.get('viewCount', 0))

          # publish_time: kept as the raw string, plus a local-time epoch stamp
          # (0 when the publish date is missing).
          publish_time = playerMicroformatRenderer.get('publishDate', '')
          if publish_time == '':
              publish_time_stamp = 0
          elif ':' in publish_time:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
          else:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

          # user_name
          user_name = videoDetails.get('author', '')

          # cover_url: the last entry of the thumbnail list, '' if unavailable.
          thumbnails = videoDetails.get('thumbnail', {}).get('thumbnails', [])
          if len(thumbnails) == 0:
              cover_url = ''
          else:
              cover_url = thumbnails[-1].get('url', '')

          # video_url: the watch-page URL, not a direct stream URL.
          video_url = f"https://www.youtube.com/watch?v={video_id}"

          Common.logger(log_type, crawler).info(f'video_title:{video_title}')
          Common.logger(log_type, crawler).info(f'video_id:{video_id}')
          Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
          Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
          Common.logger(log_type, crawler).info(f'user_name:{user_name}')
          Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
          Common.logger(log_type, crawler).info(f'video_url:{video_url}')

          video_dict = {
              'video_title': video_title,
              'video_id': video_id,
              'duration': duration,
              'play_cnt': play_cnt,
              'publish_time': publish_time,
              'publish_time_stamp': publish_time_stamp,
              'user_name': user_name,
              'out_uid': out_uid,
              'cover_url': cover_url,
              'video_url': video_url,
          }
          return video_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count the ``crawler_video`` rows stored for this platform and ``video_id``.

      :param log_type: forwarded to ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``MysqlHelper.get_values``.
      :param video_id: value matched against the ``out_video_id`` column.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :param machine: forwarded to ``MysqlHelper.get_values``.
      :return: number of matching rows (0 means the video is not stored yet).
      """
      # NOTE(review): video_id is interpolated straight into the SQL text;
      # confirm whether MysqlHelper offers bound parameters.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      return len(MysqlHelper.get_values(log_type, crawler, query, env, machine))
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
      """Download one video, publish it, and record it in Feishu and MySQL.

      The video is skipped (with an info log) when its title or URL is empty,
      when its duration is outside 60-1200 seconds, or when its id is already
      present in the ``crawler_video`` table or in Feishu sheet ``GVxlYk``.
      Otherwise the video and cover are downloaded, the metadata is saved, the
      video is uploaded through ``Publish.upload_and_publish`` and the result
      is written to the Feishu sheet and the ``crawler_video`` table.

      :param log_type: forwarded to the logger and helper classes.
      :param crawler: forwarded to the helpers; also part of the local video path.
      :param video_dict: metadata dict as built by ``get_video_info``; it is
          extended in place with size, counters, avatar and session fields.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param our_uid: forwarded to ``Publish.upload_and_publish`` and stored as ``user_id``.
      :param env: ``'dev'`` selects the test admin host for the video link.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param machine: forwarded to the MySQL helper.
      :return: None. Any exception is logged and swallowed.
      """
      try:
          if video_dict['video_title'] == '' or video_dict['video_url'] == '':
              Common.logger(log_type, crawler).info('无效视频\n')
              return
          if video_dict['duration'] > 1200 or video_dict['duration'] < 60:
              Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
              return
          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return
          # Flatten every cell of the "downloaded" sheet for the membership test.
          if video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          # Download the video.
          Common.logger(log_type, crawler).info('开始下载视频...')
          Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'], video_dict['video_url'])

          # Width and height are fixed values; the downloaded file is not probed.
          video_width = 1280
          video_height = 720
          duration = int(video_dict['duration'])
          Common.logger(log_type, crawler).info(f'video_width:{video_width}')
          Common.logger(log_type, crawler).info(f'video_height:{video_height}')
          Common.logger(log_type, crawler).info(f'duration:{duration}')

          video_dict['video_width'] = video_width
          video_dict['video_height'] = video_height
          video_dict['duration'] = duration
          video_dict['comment_cnt'] = 0
          video_dict['like_cnt'] = 0
          video_dict['share_cnt'] = 0
          video_dict['avatar_url'] = video_dict['cover_url']
          video_dict['session'] = f'youtube{int(time.time())}'
          rule = '1,2'

          # Download the cover image.
          Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
          # Save the video's text metadata.
          Common.save_video_info(log_type, crawler, video_dict)

          # Upload the video; the call is the same for every env.
          Common.logger(log_type, crawler).info("开始上传视频")
          our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
          if our_video_id is None:
              # Upload failed: delete the local video folder and stop before
              # anything is recorded as published.
              shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
              Common.logger(log_type, crawler).warning('upload failed, local video folder removed\n')
              return
          Common.logger(log_type, crawler).info("视频上传完成")

          if env == 'dev':
              our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
          else:
              our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

          # Record the video in the Feishu sheet: insert a row, then fill it in.
          Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
          upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
          values = [[upload_time,
                     "定向榜",
                     video_dict['video_id'],
                     video_dict['video_title'],
                     our_video_link,
                     video_dict['play_cnt'],
                     video_dict['duration'],
                     f'{video_width}*{video_height}',
                     video_dict['publish_time'],
                     video_dict['user_name'],
                     video_dict['cover_url'],
                     video_dict['video_url']
                     ]]
          time.sleep(1)
          Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
          Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

          # Record the video in the database.
          # NOTE(review): the values are interpolated straight into the SQL
          # text, so a title containing a double quote would break the
          # statement; confirm whether MysqlHelper offers bound parameters.
          sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          "{our_uid}",
                          "{video_dict['out_uid']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(duration)},
                          "{video_dict['publish_time']}",
                          {int(video_dict['play_cnt'])},
                          "{rule}",
                          {int(video_width)},
                          {int(video_height)}) """
          MysqlHelper.update_values(log_type, crawler, sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          # Logged at error level, like the other methods' exception handlers.
          Common.logger(log_type, crawler).error(f"download_publish异常:{e}\n")
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the videos of every user listed in Feishu sheet ``c467d7``.

      For each user returned by ``cls.get_user_from_feishu`` this calls
      ``cls.get_videos``, sleeps 10 seconds and resets ``cls.continuation``
      to an empty string before moving on to the next user.

      :param log_type: forwarded to the logger and helper methods.
      :param crawler: forwarded to the logger and helper methods.
      :param strategy: forwarded to ``get_videos``.
      :param oss_endpoint: forwarded to ``get_videos``.
      :param env: forwarded to ``get_user_from_feishu`` and ``get_videos``.
      :param machine: forwarded to ``get_user_from_feishu`` and ``get_videos``.
      :return: None. Any exception is logged and swallowed.
      """
      try:
          followed_users = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
          if len(followed_users) == 0:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return

          for followed_user in followed_users:
              out_uid = followed_user['out_user_id']
              user_name = followed_user['out_user_name']
              browse_id = followed_user['out_browse_id']
              our_uid = followed_user['our_user_id']

              Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
              cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)

              Common.logger(log_type, crawler).info('休眠 10 秒')
              time.sleep(10)
              # Clear the class-level continuation value before the next user.
              cls.continuation = ''
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  if __name__ == "__main__":
      # Script entry point. Nothing runs by default; the commented-out calls
      # below are ad-hoc invocations of individual methods, kept for manual
      # debugging. Uncomment one to run it.
      # print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')
      # print(Follow.filter_emoji("姐妹倆一唱一和,完美配合,終於把大慶降服了😅😅#萌娃搞笑日常"))
      # Follow.repeat_video('follow', 'youtube', 4, "dev", "local")
      # title = "'西部巡游220丨两人一车环游中国半年,需要花费多少钱? 2万公里吃住行费用总结'"
      # title = "'Insanely Crowded Shanghai Yu Garden Lantern Festival Walk Tour 2023 人气爆棚的上海豫园元宵节漫步之行 4K'"
      # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))
      pass