youtube_follow_api.py 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. import random
  16. # import emoji
  17. import requests
  18. from selenium import webdriver
  19. from selenium.webdriver.chrome.service import Service
  20. from selenium.webdriver.common.by import By
  21. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  22. sys.path.append(os.getcwd())
  23. from common.common import Common
  24. from common.db import MysqlHelper
  25. from common.feishu import Feishu
  26. from common.users import Users
  27. from common.publish import Publish
  28. from common.translate import Translate
  # Default HTTP request headers (desktop Chrome user agent) for module-level requests;
  # read by YoutubeFollow.get_out_user_info when it fetches a channel's "about" page.
  headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  }
  def format_nums(data):
      """
      Convert a human-readable count such as '1.2万', '3.4K' or '1,234' into an int.

      The text before the first recognised unit suffix is parsed as a float and multiplied
      by that unit's factor. Suffixes are tried in the order listed below, so '百万' wins
      over '万'. Without a unit, the first number found in the text is used.

      :param data: value to convert; it is passed through str() first
      :return: the count as int, or 0 when the text contains no digits
      :raises ValueError: when the text in front of a unit suffix is not a number
      """
      units = (
          ('亿', 100000000), ('百万', 1000000), ('万', 10000), ('k', 1000), ('w', 10000),
          ('m', 1000000), ('千', 1000), ('M', 1000000), ('K', 1000), ('W', 10000),
      )
      # Drop thousands separators so "1,234" is not read as 1
      text = str(data).replace(',', '')
      for suffix, factor in units:
          pos = text.find(suffix)
          # pos == 0 means there is no number in front of the suffix, so it is not a unit here
          if pos > 0:
              # round() rather than truncation: float error must not cost a whole unit
              return int(round(float(text[:pos]) * factor))
      number = re.search(r'\d+(?:\.\d+)?', text)
      if number is None:
          return 0
      return int(float(number.group()))
  45. class YoutubeFollow:
  # Pagination parameter; sent as the "continuation" field of the get_feeds request payload
  continuation = ''
  # Platform being crawled; used as the `platform` value in crawler_user queries
  platform = 'youtube'
  # Default request headers (desktop Chrome user agent); used by get_first_page
  headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  }
  @classmethod
  def get_browse_id(cls, log_type, crawler, out_user_id, machine):
      """
      Open a channel's videos page in headless Chrome and read its channel id.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name, passed to Common.logger
      :param out_user_id: off-site user UID; used as the path segment of the channel URL
      :param machine: deployment machine: aliyun / aliyun_hk on Alibaba Cloud,
          macpro / macair / local when run offline
      :return: browse_id (the `content` of the channelId meta tag), or None when anything fails
      """
      # chromedriver binaries of the offline machines; every other non-aliyun value of
      # `machine` falls back to the "local" path below.
      driver_paths = {
          'macpro': '/Users/lieyunye/Downloads/chromedriver_v86/chromedriver',
          'macair': '/Users/piaoquan/Downloads/chromedriver',
      }
      local_driver_path = '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'
      driver = None
      try:
          # Work on a copy: DesiredCapabilities.CHROME is a shared class-level dict
          ca = DesiredCapabilities.CHROME.copy()
          ca["goog:loggingPrefs"] = {"performance": "ALL"}
          # Run without opening a browser window
          chrome_options = webdriver.ChromeOptions()
          chrome_options.add_argument("--headless")
          chrome_options.add_argument(
              '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
          chrome_options.add_argument("--no-sandbox")
          # Driver initialisation
          if machine == 'aliyun' or machine == 'aliyun_hk':
              driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
          else:
              driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
                                        service=Service(driver_paths.get(machine, local_driver_path)))
          driver.implicitly_wait(10)
          driver.get(f'https://www.youtube.com/{out_user_id}/videos')
          time.sleep(3)
          # Click the "accept all" button if one is shown (presumably the consent dialog);
          # the Chinese label is tried first, then the English one.
          for label in ('全部接受', 'Accept all'):
              buttons = driver.find_elements(By.XPATH, f'//span[text()="{label}"]')
              if len(buttons) != 0:
                  buttons[0].click()
                  time.sleep(2)
                  break
          return driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
      finally:
          # Always shut the browser down, also on failure, so no Chrome process is leaked
          if driver is not None:
              try:
                  driver.quit()
              except Exception as e:
                  Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  @classmethod
  def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
      """
      Fetch an off-site user's profile data from the channel "about" page.

      The page's embedded `ytInitialData` JSON is parsed; name, avatar and subscriber count
      come from its header, total views and join date from the first tab that has content.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name, passed to Common.logger
      :param browse_id: browse_id (not used by this method)
      :param out_user_id: off-site user UID; used as the path segment of the channel URL
      :return: out_user_dict = {'out_user_name': off-site user nickname,
          'out_avatar_url': off-site user avatar,
          'out_fans': off-site user follower count,
          'out_play_cnt': off-site user total play count (0 when unavailable),
          'out_create_time': off-site user creation time ('' when unavailable)},
          or None when the page cannot be fetched or parsed
      """
      try:
          url = f'https://www.youtube.com/{out_user_id}/about'
          res = requests.get(url=url, headers=headers)
          info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
          data = json.loads(info)
          header = data['header']['c4TabbedHeaderRenderer']
          tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
          subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
          # Defaults for the case that no tab carries content; previously both names were
          # left unbound, which surfaced as a misleading NameError in the log.
          viewCountText = ''
          out_create_time = ''
          for tab in tabs:
              if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
                  continue
              about = tab['tabRenderer']['content']['sectionListRenderer']['contents'][0][
                  'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']
              viewCountText = about['viewCountText']['simpleText']
              out_create_time = about['joinedDateText']['runs'][1]['text']
              # Only the first tab with content is inspected
              break
          else:
              Common.logger(log_type, crawler).warning('get_out_user_info: no tab with content found\n')
          out_user_dict = {
              'out_user_name': header['title'],
              'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
              'out_fans': format_nums(subsimpleText),
              'out_play_cnt': int(
                  viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
              'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
          }
          return out_user_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  @classmethod
  def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
      """
      Complete the Feishu user sheet and return the users listed in it.

      For every row (the first row is skipped) that has both an off-site UID and a user name:
      when the on-site UID cell is empty, the user is looked up in `crawler_user`. If no row
      exists, an on-site account is created and a `crawler_user` row is inserted; otherwise
      the stored UID is reused. In both cases the UID and its admin link are written back to
      columns G:H of the sheet.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name
      :param sheetid: Feishu sheet
      :param env: production environment: prod, test environment: dev
      :param machine: deployment machine: aliyun, aliyun_hk on Alibaba Cloud,
          macpro, macair, local when run offline
      :return: user_list, a list of dicts with the keys out_user_id, out_user_name,
          out_browse_id, our_user_id and out_user_url; None when an unexpected error aborts the run
      """
      try:
          user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
          user_list = []
          # The admin host depends only on the environment. The original second branch compared
          # the literal 'env' with 'prod', so prod runs always got the test admin link.
          admin_host = 'admin' if env == 'prod' else 'testadmin'
          # sheet_row is the 1-based row number used in Feishu cell ranges
          for sheet_row, row in enumerate(user_sheet[1:], start=2):
              out_uid = row[2]
              user_name = row[3]
              uer_url = row[4]
              # NOTE: looking up a missing browse_id via get_browse_id is currently disabled;
              # the value from the sheet is used as-is.
              browse_id = row[5]
              our_uid = row[6]
              if out_uid is None or user_name is None:
                  continue
              Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
              if our_uid is None:
                  # NOTE(review): SQL is built by string interpolation from sheet values; a quote
                  # in out_uid / user_name breaks the statement. Bind parameters if MysqlHelper
                  # supports them -- TODO confirm.
                  sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
                  our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
                  if our_user_info is None or len(our_user_info) == 0:
                      # No (platform, out_user_id) row yet: create the on-site account, write it
                      # to the sheet and store it together with the off-site profile data.
                      out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
                      if out_user_dict is None:
                          # One unreadable channel must not abort the whole sheet
                          Common.logger(log_type, crawler).warning(
                              f'get_out_user_info returned None for {out_uid}, user skipped\n')
                          continue
                      out_avatar_url = out_user_dict['out_avatar_url']
                      out_create_time = out_user_dict['out_create_time']
                      out_play_cnt = out_user_dict['out_play_cnt']
                      out_fans = out_user_dict['out_fans']
                      tag = 'youtube爬虫,定向爬虫策略'
                      create_user_dict = {
                          'nickName': user_name,
                          'avatarUrl': out_avatar_url,
                          'tagName': tag,
                      }
                      our_uid = Users.create_uid(log_type, crawler, create_user_dict, env)
                      Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
                      our_user_link = f'https://{admin_host}.piaoquantv.com/ums/user/{our_uid}/post'
                      Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
                      Feishu.update_values(log_type, crawler, sheetid, f'G{sheet_row}:H{sheet_row}',
                                           [[our_uid, our_user_link]])
                      Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
                      sql = f""" insert into crawler_user(user_id,
                                          out_user_id,
                                          out_user_name,
                                          out_avatar_url,
                                          out_create_time,
                                          out_play_cnt,
                                          out_fans,
                                          platform,
                                          tag)
                                          values({our_uid},
                                          "{out_uid}",
                                          "{user_name}",
                                          "{out_avatar_url}",
                                          "{out_create_time}",
                                          {out_play_cnt},
                                          {out_fans},
                                          "{cls.platform}",
                                          "{tag}") """
                      Common.logger(log_type, crawler).info(f'sql:{sql}')
                      MysqlHelper.update_values(log_type, crawler, sql, env, machine)
                      Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
                  else:
                      # A row already exists: reuse its on-site UID and write it to the sheet.
                      # presumably column 1 of crawler_user is user_id -- TODO confirm
                      our_uid = our_user_info[0][1]
                      our_user_link = f'https://{admin_host}.piaoquantv.com/ums/user/{our_uid}/post'
                      Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
                      Feishu.update_values(log_type, crawler, sheetid, f'G{sheet_row}:H{sheet_row}',
                                           [[our_uid, our_user_link]])
                      Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
              user_list.append({
                  'out_user_id': out_uid,
                  'out_user_name': user_name,
                  'out_browse_id': browse_id,
                  'our_user_id': our_uid,
                  'out_user_url': uer_url,
              })
          return user_list
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  255. @classmethod
  256. def get_continuation(cls, data):
  257. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  258. return continuation
  @classmethod
  def get_feeds(cls, log_type, crawler, browse_id, out_uid):
      """
      Request one page of a user's video list from the youtubei `browse` endpoint.

      The request carries `cls.continuation`; after a successful (HTTP 200) response
      `cls.continuation` is overwritten with the response's `trackingParams` value.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name, passed to Common.logger
      :param browse_id: unique request parameter of each user's home page (currently not sent;
          the `browseId` payload field is disabled)
      :param out_uid: off-site user UID; used in the originalUrl / graftUrl / referer values
      :return: list of feed items, or None when the response holds no usable items or an error occurred
      """
      url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
      ad_signal_pairs = (
          ("dt", "1675676731048"), ("flash", "0"), ("frm", "0"), ("u_tz", "480"), ("u_his", "4"),
          ("u_h", "1080"), ("u_w", "1920"), ("u_ah", "1080"), ("u_aw", "1920"), ("u_cd", "24"),
          ("bc", "31"), ("bih", "969"), ("biw", "944"),
          ("brdim", "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"),
          ("vis", "1"), ("wgl", "true"), ("ca_type", "image"),
      )
      payload = json.dumps({
          "context": {
              "client": {
                  "hl": "zh-CN",
                  "gl": "US",
                  "remoteHost": "38.93.247.21",
                  "deviceMake": "Apple",
                  "deviceModel": "",
                  "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
                  "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                  "clientName": "WEB",
                  "clientVersion": "2.20230201.01.00",
                  "osName": "Macintosh",
                  "osVersion": "10_15_7",
                  "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
                  "platform": "DESKTOP",
                  "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                  "configInfo": {
                      "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
                  },
                  "timeZone": "Asia/Shanghai",
                  "browserName": "Chrome",
                  "browserVersion": "109.0.0.0",
                  "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                  "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
                  "screenWidthPoints": 944,
                  "screenHeightPoints": 969,
                  "screenPixelDensity": 1,
                  "screenDensityFloat": 1,
                  "utcOffsetMinutes": 480,
                  "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                  "memoryTotalKbytes": "8000000",
                  "mainAppWebInfo": {
                      "graftUrl": f"/{out_uid}/videos",
                      "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                      "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                      "isWebNativeShareAvailable": True
                  }
              },
              "user": {
                  "lockedSafetyMode": False
              },
              "request": {
                  "useSsl": True,
                  "internalExperimentFlags": [],
                  "consistencyTokenJars": []
              },
              "clickTracking": {
                  "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
              },
              "adSignalsInfo": {
                  "params": [{"key": key, "value": value} for key, value in ad_signal_pairs],
                  "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
              }
          },
          # "browseId": browse_id,
          "params": "EgZ2aWRlb3PyBgQKAjoA",
          "continuation": cls.continuation
      })
      headers = {
          'authority': 'www.youtube.com',
          'accept': '*/*',
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
          'cache-control': 'no-cache',
          'content-type': 'application/json',
          'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
          'origin': 'https://www.youtube.com',
          'pragma': 'no-cache',
          'referer': f'https://www.youtube.com/{out_uid}/featured',
          'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
          'sec-ch-ua-arch': '"arm"',
          'sec-ch-ua-bitness': '"64"',
          'sec-ch-ua-full-version': '"109.0.1518.52"',
          'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
          'sec-ch-ua-mobile': '?0',
          'sec-ch-ua-model': '',
          'sec-ch-ua-platform': '"macOS"',
          'sec-ch-ua-platform-version': '"12.4.0"',
          'sec-ch-ua-wow64': '?0',
          'sec-fetch-dest': 'empty',
          'sec-fetch-mode': 'same-origin',
          'sec-fetch-site': 'same-origin',
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
          'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
          'x-youtube-bootstrap-logged-in': 'false',
          'x-youtube-client-name': '1',
          'x-youtube-client-version': '2.20230201.01.00'
      }
      try:
          response = requests.post(url=url, headers=headers, data=payload)
          # Check the status before touching the body: an error page is usually not JSON, and
          # parsing it first turned the intended warning into a generic exception log.
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
              return None
          # Parse the body once instead of on every access
          body = response.json()
          # NOTE(review): the next "continuation" is taken from `trackingParams` -- confirm intended
          cls.continuation = body['trackingParams']
          if 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
              Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
              return None
          if 'continuationContents' in body:
              continuation_contents = body['continuationContents']
              if 'richGridContinuation' not in continuation_contents:
                  Common.logger(log_type, crawler).warning(
                      f'get_feeds_response:{continuation_contents}\n')
                  return None
              rich_grid = continuation_contents['richGridContinuation']
              if 'contents' not in rich_grid:
                  Common.logger(log_type, crawler).warning(
                      f'get_feeds_response:{rich_grid}\n')
                  return None
              return rich_grid['contents']
          if 'onResponseReceivedActions' in body:
              Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
              actions = body['onResponseReceivedActions']
              if len(actions) == 0:
                  Common.logger(log_type, crawler).warning(f'get_feeds_response:{actions}\n')
                  return None
              if 'appendContinuationItemsAction' not in actions[0]:
                  Common.logger(log_type, crawler).warning(f'get_feeds_response:{actions[0]}\n')
                  return None
              append_action = actions[0]['appendContinuationItemsAction']
              if 'continuationItems' not in append_action:
                  Common.logger(log_type, crawler).warning(f'get_feeds_response:{append_action}\n')
                  return None
              items = append_action['continuationItems']
              if len(items) == 0:
                  Common.logger(log_type, crawler).warning(f'get_feeds_response:{items}\n')
                  return None
              return items
          # One of the markers occurs in the text, but not as a top-level key
          Common.logger(log_type, crawler).info('feeds is None\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  @classmethod
  def get_first_page(cls, user_url):
      """
      Fetch a channel page and return the items of its first video grid page.

      The list is read from the embedded `ytInitialData` JSON, at the second tab's
      `richGridRenderer` contents.

      :param user_url: URL of the channel page to request
      :return: list of grid items; an empty list when the page cannot be fetched or parsed
          (best effort: this method has no logger arguments, so failures are not logged)
      """
      try:
          res = requests.get(url=user_url, headers=cls.headers)
          # Anchor on ';</script>' (as get_out_user_info does): a bare ';' also matches the
          # first semicolon inside the JSON text and truncates the blob.
          info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
          ytInitialData = json.loads(info)
          return ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer'][
              'content']['richGridRenderer']['contents']
      except Exception:
          return []
  @classmethod
  def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
                    machine, out_user_url, continuation):
      """
      Request the page identified by `continuation` and process its videos.

      Every video published within the last 7 days is handed to `download_publish`. The first
      older video ends this page. An entry without `richItemRenderer` is treated as the
      continuation item and the method recurses into the next page.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name
      :param strategy: passed through to download_publish
      :param oss_endpoint: passed through to download_publish
      :param env: passed through to download_publish
      :param out_uid: off-site user UID
      :param our_uid: on-site user UID
      :param machine: deployment machine
      :param out_user_url: channel URL; sent as graftUrl and referer
      :param continuation: continuation token of the page to request
      :return: None
      """
      post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
      payload = json.dumps({
          "context": {
              "client": {
                  "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
                  "clientName": "WEB",
                  "clientVersion": "2.20230221.06.00",
                  "osName": "Macintosh",
                  "osVersion": "10_15_7",
                  # NOTE(review): fixed channel URL regardless of out_user_url -- confirm intended
                  "originalUrl": "https://www.youtube.com/@wongkim728/videos",
                  "screenPixelDensity": 2,
                  "platform": "DESKTOP",
                  "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                  "configInfo": {
                      "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
                  },
                  "screenDensityFloat": 2,
                  "timeZone": "Asia/Shanghai",
                  "browserName": "Chrome",
                  "browserVersion": "110.0.0.0",
                  "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                  "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
                  "screenWidthPoints": 576,
                  "screenHeightPoints": 764,
                  "utcOffsetMinutes": 480,
                  "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                  "connectionType": "CONN_CELLULAR_4G",
                  "memoryTotalKbytes": "8000000",
                  "mainAppWebInfo": {
                      "graftUrl": out_user_url,
                      "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                      "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                      "isWebNativeShareAvailable": False
                  }
              },
              "user": {
                  "lockedSafetyMode": False
              },
              "request": {
                  "useSsl": True,
                  "internalExperimentFlags": [],
                  "consistencyTokenJars": []
              },
              "clickTracking": {
                  "clickTrackingParams": ""
              },
              "adSignalsInfo": {
                  "params": [],
                  "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
              }
          },
          "continuation": continuation
      })
      headers = {
          # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
          'content-type': 'application/json',
          # NOTE(review): hard-coded cookie that looks like a logged-in session (SID / SAPISID /
          # LOGIN_INFO); it should be loaded from configuration, not kept in source control.
          'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
          'origin': 'https://www.youtube.com',
          'referer': out_user_url,
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
      }
      try:
          res = requests.request("POST", post_url, headers=headers, data=payload).json()
          video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
          for data in video_infos:
              if 'richItemRenderer' in data:
                  video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
                  video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
                  # Publish time <= 7 days
                  publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
                  if int(time.time()) - publish_time <= 3600 * 24 * 7:
                      cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
                                           machine)
                  else:
                      Common.logger(log_type, crawler).info('发布时间超过7天\n')
                      return
              else:
                  continuation = cls.get_continuation(data)
                  cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
                                    machine, out_user_url, continuation)
      except Exception as e:
          # Still best effort, but no longer silent, and KeyboardInterrupt / SystemExit
          # are no longer swallowed as they were by the bare `except`.
          Common.logger(log_type, crawler).error(f"get_next_page异常:{e}\n")
          return
  @classmethod
  def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
                 machine, out_user_url):
      """
      Crawl a user's videos, starting from the first page of the channel.

      Videos published within the last 7 days are passed to `download_publish`; the first
      older video ends the run. An entry without `richItemRenderer` is treated as the
      continuation item and handed to `get_next_page`.

      :param log_type: log type, passed to Common.logger
      :param crawler: crawler name
      :param strategy: passed through to download_publish / get_next_page
      :param oss_endpoint: passed through to download_publish / get_next_page
      :param env: passed through to download_publish / get_next_page
      :param out_uid: off-site user UID
      :param our_uid: on-site user UID
      :param machine: deployment machine
      :param out_user_url: channel URL of the first page
      :return: None
      """
      try:
          for item in cls.get_first_page(out_user_url):
              if 'richItemRenderer' not in item:
                  next_token = cls.get_continuation(item)
                  cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
                                    machine, out_user_url, continuation=next_token)
                  continue
              renderer = item["richItemRenderer"]["content"]['videoRenderer']
              video_id = renderer['videoId']
              video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
              # Publish time <= 7 days
              published_at = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
              age_seconds = int(time.time()) - published_at
              if age_seconds > 3600 * 24 * 7:
                  Common.logger(log_type, crawler).info('发布时间超过7天\n')
                  return
              cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
                                   machine)
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  592. @classmethod
  593. def filter_emoji(cls, title):
  594. # 过滤表情
  595. try:
  596. co = re.compile(u'[\U00010000-\U0010ffff]')
  597. except re.error:
  598. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  599. return co.sub("", title)
  600. @classmethod
  601. def is_contain_chinese(cls, strword):
  602. for ch in strword:
  603. if u'\u4e00' <= ch <= u'\u9fff':
  604. return True
  605. return False
  @classmethod
  def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id, machine):
      """Flatten a YouTube player response into the crawler's video record.

      :param video_dict: decoded player response; must contain the
          ``streamingData``, ``videoDetails`` and ``microformat`` sections.
      :param log_type: passed to ``Common.logger``.
      :param crawler: passed to ``Common.logger``.
      :param out_uid: copied into the result under ``out_uid``.
      :param video_id: YouTube video id; used to build the watch URL.
      :param machine: passed to ``Translate.google_translate``.
      :return: dict with keys ``video_title``, ``video_id``, ``duration``,
          ``play_cnt``, ``publish_time``, ``publish_time_stamp``,
          ``user_name``, ``out_uid``, ``cover_url`` and ``video_url``;
          ``None`` if a required section is missing or parsing fails (the
          problem is logged instead of raised).
      """
      try:
          # Without all three sections the response is unusable: log it and give up.
          required = ('streamingData', 'videoDetails', 'microformat')
          if any(section not in video_dict for section in required):
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
              return None

          microformat = video_dict['microformat']['playerMicroformatRenderer']
          details = video_dict['videoDetails']

          # video_title
          video_title = cls.filter_emoji(details.get('title', ''))
          # Translate titles without Chinese characters to Chinese. An empty
          # title is left empty rather than sent to the translator.
          if video_title and not cls.is_contain_chinese(video_title):
              # "amp;" must be stripped before ";" is removed, otherwise the
              # residue of "&amp;" stays in the title as "amp".
              video_title = Translate.google_translate(video_title, machine) \
                  .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
                  .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
                  .replace("amp;", "").replace(";", "")

          duration = int(details.get('lengthSeconds', 0))
          play_cnt = int(details.get('viewCount', 0))

          # publish_time: a value containing ':' is parsed as date + time,
          # anything else as a plain date.
          publish_time = microformat.get('publishDate', '')
          if publish_time == '':
              publish_time_stamp = 0
          elif ':' in publish_time:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
          else:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

          user_name = details.get('author', '')

          # cover_url: the last entry of the thumbnail list.
          thumbnails = details.get('thumbnail', {}).get('thumbnails') or []
          cover_url = thumbnails[-1].get('url', '') if thumbnails else ''

          # video_url: the public watch page, not a stream URL from streamingData.
          video_url = f"https://www.youtube.com/watch?v={video_id}"

          for label, value in (('video_title', video_title), ('video_id', video_id),
                               ('play_cnt', play_cnt), ('publish_time', publish_time),
                               ('user_name', user_name), ('cover_url', cover_url),
                               ('video_url', video_url)):
              Common.logger(log_type, crawler).info(f'{label}:{value}')

          return {
              'video_title': video_title,
              'video_id': video_id,
              'duration': duration,
              'play_cnt': play_cnt,
              'publish_time': publish_time,
              'publish_time_stamp': publish_time_stamp,
              'user_name': user_name,
              'out_uid': out_uid,
              'cover_url': cover_url,
              'video_url': video_url,
          }
      except Exception as e:
          Common.logger(log_type, crawler).error(f"parse_video异常:{e}\n")
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
      """Fetch one video's player response from YouTube and parse it.

      Posts a hard-coded WEB-client context (with ``video_id`` substituted)
      to the ``youtubei/v1/player`` endpoint and hands the decoded JSON to
      ``cls.parse_video``.

      :param log_type: passed to ``Common.logger``.
      :param crawler: passed to ``Common.logger``.
      :param out_uid: forwarded to ``parse_video``.
      :param video_id: YouTube video id to look up.
      :param machine: forwarded to ``parse_video``.
      :return: the dict built by ``parse_video``, or ``None`` on a non-200
          response, a missing response section, or any exception (logged,
          not raised).
      """
      try:
          url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
          payload = json.dumps({
              "context": {
                  "client": {
                      "hl": "zh-CN",
                      "gl": "US",
                      "remoteHost": "38.93.247.21",
                      "deviceMake": "Apple",
                      "deviceModel": "",
                      "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                      "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                      "clientName": "WEB",
                      "clientVersion": "2.20230201.01.00",
                      "osName": "Macintosh",
                      "osVersion": "10_15_7",
                      "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
                      "platform": "DESKTOP",
                      "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                      "configInfo": {
                          "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                      },
                      "timeZone": "Asia/Shanghai",
                      "browserName": "Chrome",
                      "browserVersion": "109.0.0.0",
                      "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                      "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                      "screenWidthPoints": 1037,
                      "screenHeightPoints": 969,
                      "screenPixelDensity": 1,
                      "screenDensityFloat": 1,
                      "utcOffsetMinutes": 480,
                      "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                      "memoryTotalKbytes": "8000000",
                      "clientScreen": "WATCH",
                      "mainAppWebInfo": {
                          "graftUrl": f"/watch?v={video_id}",
                          "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                          "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                          "isWebNativeShareAvailable": True
                      }
                  },
                  "user": {
                      "lockedSafetyMode": False
                  },
                  "request": {
                      "useSsl": True,
                      "internalExperimentFlags": [],
                      "consistencyTokenJars": []
                  },
                  "clickTracking": {
                      "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
                  },
                  "adSignalsInfo": {
                      "params": [
                          {"key": "dt", "value": "1675749222611"},
                          {"key": "flash", "value": "0"},
                          {"key": "frm", "value": "0"},
                          {"key": "u_tz", "value": "480"},
                          {"key": "u_his", "value": "3"},
                          {"key": "u_h", "value": "1080"},
                          {"key": "u_w", "value": "1920"},
                          {"key": "u_ah", "value": "1080"},
                          {"key": "u_aw", "value": "1920"},
                          {"key": "u_cd", "value": "24"},
                          {"key": "bc", "value": "31"},
                          {"key": "bih", "value": "969"},
                          {"key": "biw", "value": "1037"},
                          {"key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"},
                          {"key": "vis", "value": "1"},
                          {"key": "wgl", "value": "true"},
                          {"key": "ca_type", "value": "image"}
                      ],
                      "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
                  }
              },
              "videoId": str(video_id),
              "playbackContext": {
                  "contentPlaybackContext": {
                      "currentUrl": f"/watch?v={video_id}",
                      "vis": 0,
                      "splay": False,
                      "autoCaptionsDefaultOn": False,
                      "autonavState": "STATE_NONE",
                      "html5Preference": "HTML5_PREF_WANTS",
                      "signatureTimestamp": 19394,
                      "referer": f"https://www.youtube.com/watch?v={video_id}",
                      "lactMilliseconds": "-1",
                      "watchAmbientModeContext": {
                          "watchAmbientModeEnabled": True
                      }
                  }
              },
              "racyCheckOk": False,
              "contentCheckOk": False
          })
          headers = {
              'authority': 'www.youtube.com',
              'accept': '*/*',
              'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
              'cache-control': 'no-cache',
              'content-type': 'application/json',
              'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
              'origin': 'https://www.youtube.com',
              'pragma': 'no-cache',
              'referer': f'https://www.youtube.com/watch?v={video_id}',
              'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
              'sec-ch-ua-arch': '"arm"',
              'sec-ch-ua-bitness': '"64"',
              'sec-ch-ua-full-version': '"109.0.1518.52"',
              'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-model': '',
              'sec-ch-ua-platform': '"macOS"',
              'sec-ch-ua-platform-version': '"12.4.0"',
              'sec-ch-ua-wow64': '?0',
              'sec-fetch-dest': 'empty',
              'sec-fetch-mode': 'same-origin',
              'sec-fetch-site': 'same-origin',
              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
              'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
              'x-youtube-bootstrap-logged-in': 'false',
              'x-youtube-client-name': '1',
              'x-youtube-client-version': '2.20230201.01.00'
          }
          # A timeout keeps a stalled connection from blocking the crawl forever;
          # the resulting exception is logged by the handler below.
          response = requests.post(url=url, headers=headers, data=payload, timeout=30)
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
              return None
          # Decode the body once; parse_video performs the section checks and
          # field extraction that used to be duplicated here.
          return cls.parse_video(response.json(), log_type, crawler, out_uid, video_id, machine)
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count ``crawler_video`` rows already stored for this video.

      :param log_type: passed to ``MysqlHelper.get_values``.
      :param crawler: passed to ``MysqlHelper.get_values``.
      :param video_id: external video id matched against ``out_video_id``.
      :param env: passed to ``MysqlHelper.get_values``.
      :param machine: passed to ``MysqlHelper.get_values``.
      :return: number of rows returned for ``cls.platform`` and ``video_id``.
      """
      # NOTE(review): the query is assembled by string formatting; video_id
      # is not escaped here.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      return len(MysqlHelper.get_values(log_type, crawler, query, env, machine))
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
      """Download one video, publish it, and record it in Feishu and MySQL.

      The video is skipped (with an info log) when its title or URL is
      empty, its duration is outside 60..1200 seconds, it already has a row
      in ``crawler_video``, or its id already appears in Feishu sheet
      ``GVxlYk``.

      :param log_type: passed to the logging, Feishu, MySQL and publish helpers.
      :param crawler: crawler name; also the root of the local videos folder.
      :param video_dict: record with the keys produced by ``parse_video``;
          it is extended in place with size, counter and session fields.
      :param strategy: passed to ``Publish.upload_and_publish``.
      :param our_uid: internal user id the video is published under.
      :param env: ``'dev'`` selects the test admin host for the video link.
      :param oss_endpoint: passed to ``Publish.upload_and_publish``.
      :param machine: passed to the MySQL helper.
      :return: ``None``; any exception is logged, not raised.
      """
      try:
          if video_dict['video_title'] == '' or video_dict['video_url'] == '':
              Common.logger(log_type, crawler).info('无效视频\n')
              return
          if video_dict['duration'] > 1200 or video_dict['duration'] < 60:
              Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
              return
          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return
          # Second duplicate check against the ids already written to the Feishu sheet.
          downloaded_rows = Feishu.get_values_batch(log_type, crawler, 'GVxlYk')
          if any(video_dict['video_id'] in row for row in downloaded_rows):
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          # Download the video.
          Common.logger(log_type, crawler).info('开始下载视频...')
          Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
                                 video_dict['video_url'])
          # Width and height are fixed values, not probed from the downloaded file.
          video_width = 1280
          video_height = 720
          duration = int(video_dict['duration'])
          Common.logger(log_type, crawler).info(f'video_width:{video_width}')
          Common.logger(log_type, crawler).info(f'video_height:{video_height}')
          Common.logger(log_type, crawler).info(f'duration:{duration}')
          video_dict['video_width'] = video_width
          video_dict['video_height'] = video_height
          video_dict['duration'] = duration
          video_dict['comment_cnt'] = 0
          video_dict['like_cnt'] = 0
          video_dict['share_cnt'] = 0
          video_dict['avatar_url'] = video_dict['cover_url']
          video_dict['session'] = f'youtube{int(time.time())}'
          rule = '1,2'

          # Download the cover image.
          Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
          # Save the video's text metadata.
          Common.save_video_info(log_type, crawler, video_dict)

          # Upload; only the admin host of the resulting link depends on env.
          Common.logger(log_type, crawler).info(f"开始上传视频")
          our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"
          Common.logger(log_type, crawler).info("视频上传完成")

          if our_video_id is None:
              # Upload produced no id: remove the local video folder and stop.
              shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
              return

          # Record the video in the Feishu sheet: insert a row, then fill it.
          Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
          upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
          values = [[upload_time,
                     "定向榜",
                     video_dict['video_id'],
                     video_dict['video_title'],
                     our_video_link,
                     video_dict['play_cnt'],
                     video_dict['duration'],
                     f'{video_width}*{video_height}',
                     video_dict['publish_time'],
                     video_dict['user_name'],
                     video_dict['cover_url'],
                     video_dict['video_url']
                     ]]
          Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
          Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

          # Record the video in MySQL.
          # NOTE(review): the statement is built by string formatting; a double
          # quote in video_title or user data would break it. Switch to
          # parameters if MysqlHelper supports them.
          sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          "{our_uid}",
                          "{video_dict['out_uid']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(duration)},
                          "{video_dict['publish_time']}",
                          {int(video_dict['play_cnt'])},
                          "{rule}",
                          {int(video_width)},
                          {int(video_height)}) """
          MysqlHelper.update_values(log_type, crawler, sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          # Logged at error level, like the other methods' failure handlers.
          Common.logger(log_type, crawler).error(f"download_publish异常:{e}\n")
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the home-page videos of every user listed in Feishu sheet ``c467d7``.

      Each user is handed to ``cls.get_videos``; ``cls.continuation`` is
      reset to an empty string after each user. Any exception is logged,
      not raised.

      :param log_type: passed to the logger and helper calls.
      :param crawler: passed to the logger and helper calls.
      :param strategy: forwarded to ``get_videos``.
      :param oss_endpoint: forwarded to ``get_videos``.
      :param env: forwarded to ``get_user_from_feishu`` and ``get_videos``.
      :param machine: forwarded to ``get_user_from_feishu`` and ``get_videos``.
      """
      wanted_fields = ('out_user_id', 'out_user_name', 'out_browse_id', 'our_user_id', 'out_user_url')
      try:
          followed = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
          if len(followed) == 0:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return
          for entry in followed:
              # All five fields are read in this order; the browse id is
              # read but not used afterwards.
              out_uid, user_name, _browse_id, our_uid, out_user_url = (
                  entry[field] for field in wanted_fields)
              Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
              cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, machine,
                             out_user_url)
              cls.continuation = ''
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  if __name__ == "__main__":
      # Manual debugging entry points. Nothing runs by default; uncomment one
      # call to exercise a single step in isolation.
      # print(YoutubeFollow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
      # print(YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # print(YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'prod', 'prod'))
      # YoutubeFollow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # YoutubeFollow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # YoutubeFollow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')
      # print(YoutubeFollow.filter_emoji("<title mixing Chinese text and emoji>"))
      # YoutubeFollow.repeat_video('follow', 'youtube', 4, "dev", "local")
      # title = "<sample title, Chinese only or mixed English/Chinese>"
      # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))
      pass