youtube_follow_api.py 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/3
"""
YouTube targeted-account list
1. Publish time <= 1 month
2. 10 minutes >= duration >= 1 minute
"""
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. from hashlib import md5
  16. import requests
  17. sys.path.append(os.getcwd())
  18. from common.common import Common
  19. from common.db import MysqlHelper
  20. from common.feishu import Feishu
  21. from common.getuser import getUser
  22. from common.publish import Publish
  23. from common.translate import Translate
  24. from common.public import get_user_from_mysql, get_config_from_mysql
  25. headers = {
  26. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  27. }
  28. def format_nums(data):
  29. data_dict = [{'亿': 100000000}, {'百万': 1000000}, {'万': 10000}, {'k': 1000}, {'w': 10000}, {'m': 1000000},
  30. {'千': 1000}, {'M': 1000000}, {'K': 1000}, {'W': 10000}]
  31. data = str(data)
  32. for i in data_dict:
  33. index = data.find(list(i.keys())[0])
  34. if index > 0:
  35. count = int(float(data[:index]) * list(i.values())[0])
  36. return count
  37. elif index < 0:
  38. continue
  39. count = int(float(re.findall(r'\d+', data)[0]))
  40. return count
  41. class YoutubeFollow:
  42. # 翻页参数
  43. continuation = ''
  44. # 抓取平台
  45. platform = 'youtube'
  46. headers = {
  47. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  48. }
  49. @classmethod
  50. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  51. """
  52. 获取站外用户信息
  53. :param log_type: 日志
  54. :param crawler: 哪款爬虫
  55. :param browse_id: browse_id
  56. :param out_user_id: 站外用户 UID
  57. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  58. 'out_avatar_url': 站外用户头像,
  59. 'out_fans': 站外用户粉丝量,
  60. 'out_play_cnt': 站外用户总播放量,
  61. 'out_create_time': 站外用户创建时间}
  62. """
  63. try:
  64. url = f'https://www.youtube.com/{out_user_id}/about'
  65. res = requests.get(url=url, headers=headers)
  66. info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
  67. data = json.loads(info)
  68. header = data['header']['c4TabbedHeaderRenderer']
  69. tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
  70. try:
  71. subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
  72. out_fans = format_nums(subsimpleText)
  73. except Exception as e:
  74. out_fans = 0
  75. for tab in tabs:
  76. if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
  77. continue
  78. viewCountText = \
  79. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  80. 'contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText']
  81. out_create_time = \
  82. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  83. 'contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][1]['text']
  84. break
  85. out_user_dict = {
  86. 'out_user_name': header['title'],
  87. 'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
  88. 'out_fans': out_fans,
  89. 'out_play_cnt': int(
  90. viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
  91. 'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
  92. }
  93. # print(out_user_dict)
  94. return out_user_dict
  95. except Exception as e:
  96. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  97. @classmethod
  98. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  99. """
  100. 补全飞书用户表信息,并返回
  101. :param log_type: 日志
  102. :param crawler: 哪款爬虫
  103. :param sheetid: 飞书表
  104. :param env: 正式环境:prod,测试环境:dev
  105. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  106. :return: user_list
  107. """
  108. try:
  109. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  110. user_list = []
  111. for i in range(1, len(user_sheet)):
  112. out_uid = user_sheet[i][2]
  113. user_name = user_sheet[i][3]
  114. browse_id = user_sheet[i][5]
  115. our_uid = user_sheet[i][6]
  116. uer_url = user_sheet[i][4]
  117. if out_uid is not None and user_name is not None:
  118. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  119. if our_uid is None:
  120. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  121. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  122. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  123. if not our_user_info:
  124. # 获取站外账号信息,写入数据库
  125. try:
  126. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  127. except Exception as e:
  128. continue
  129. out_avatar_url = out_user_dict['out_avatar_url']
  130. out_create_time = out_user_dict['out_create_time']
  131. out_play_cnt = out_user_dict['out_play_cnt']
  132. out_fans = out_user_dict['out_fans']
  133. tag = 'youtube爬虫,定向爬虫策略'
  134. # 创建站内账号
  135. create_user_dict = {
  136. 'nickName': user_name,
  137. 'avatarUrl': out_avatar_url,
  138. 'tagName': tag,
  139. }
  140. our_uid = getUser.create_uid(log_type, crawler, create_user_dict, env)
  141. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  142. if env == 'prod':
  143. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  144. else:
  145. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  146. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  147. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  148. [[our_uid, our_user_link]])
  149. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  150. sql = f""" insert into crawler_user(user_id,
  151. out_user_id,
  152. out_user_name,
  153. out_avatar_url,
  154. out_create_time,
  155. out_play_cnt,
  156. out_fans,
  157. platform,
  158. tag)
  159. values({our_uid},
  160. "{out_uid}",
  161. "{user_name}",
  162. "{out_avatar_url}",
  163. "{out_create_time}",
  164. {out_play_cnt},
  165. {out_fans},
  166. "{cls.platform}",
  167. "{tag}") """
  168. Common.logger(log_type, crawler).info(f'sql:{sql}')
  169. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  170. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  171. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  172. else:
  173. our_uid = our_user_info[0][1]
  174. if 'env' == 'prod':
  175. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  176. else:
  177. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  178. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  179. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  180. [[our_uid, our_user_link]])
  181. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  182. user_dict = {
  183. 'out_user_id': out_uid,
  184. 'out_user_name': user_name,
  185. 'out_browse_id': browse_id,
  186. 'our_user_id': our_uid,
  187. 'out_user_url': uer_url
  188. }
  189. user_list.append(user_dict)
  190. else:
  191. pass
  192. return user_list
  193. except Exception as e:
  194. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  195. @classmethod
  196. def get_continuation(cls, data):
  197. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  198. return continuation
  199. @classmethod
  200. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  201. """
  202. 获取个人主页视频列表
  203. :param log_type: 日志
  204. :param crawler: 哪款爬虫
  205. :param browse_id: 每个用户主页的请求参数中唯一值
  206. :param out_uid: 站外用户UID
  207. :return: video_list
  208. """
  209. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  210. payload = json.dumps({
  211. "context": {
  212. "client": {
  213. "hl": "zh-CN",
  214. "gl": "US",
  215. "remoteHost": "38.93.247.21",
  216. "deviceMake": "Apple",
  217. "deviceModel": "",
  218. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  219. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  220. "clientName": "WEB",
  221. "clientVersion": "2.20230201.01.00",
  222. "osName": "Macintosh",
  223. "osVersion": "10_15_7",
  224. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  225. "platform": "DESKTOP",
  226. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  227. "configInfo": {
  228. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  229. },
  230. "timeZone": "Asia/Shanghai",
  231. "browserName": "Chrome",
  232. "browserVersion": "109.0.0.0",
  233. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  234. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  235. "screenWidthPoints": 944,
  236. "screenHeightPoints": 969,
  237. "screenPixelDensity": 1,
  238. "screenDensityFloat": 1,
  239. "utcOffsetMinutes": 480,
  240. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  241. "memoryTotalKbytes": "8000000",
  242. "mainAppWebInfo": {
  243. "graftUrl": f"/{out_uid}/videos",
  244. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  245. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  246. "isWebNativeShareAvailable": True
  247. }
  248. },
  249. "user": {
  250. "lockedSafetyMode": False
  251. },
  252. "request": {
  253. "useSsl": True,
  254. "internalExperimentFlags": [],
  255. "consistencyTokenJars": []
  256. },
  257. "clickTracking": {
  258. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  259. },
  260. "adSignalsInfo": {
  261. "params": [
  262. {
  263. "key": "dt",
  264. "value": "1675676731048"
  265. },
  266. {
  267. "key": "flash",
  268. "value": "0"
  269. },
  270. {
  271. "key": "frm",
  272. "value": "0"
  273. },
  274. {
  275. "key": "u_tz",
  276. "value": "480"
  277. },
  278. {
  279. "key": "u_his",
  280. "value": "4"
  281. },
  282. {
  283. "key": "u_h",
  284. "value": "1080"
  285. },
  286. {
  287. "key": "u_w",
  288. "value": "1920"
  289. },
  290. {
  291. "key": "u_ah",
  292. "value": "1080"
  293. },
  294. {
  295. "key": "u_aw",
  296. "value": "1920"
  297. },
  298. {
  299. "key": "u_cd",
  300. "value": "24"
  301. },
  302. {
  303. "key": "bc",
  304. "value": "31"
  305. },
  306. {
  307. "key": "bih",
  308. "value": "969"
  309. },
  310. {
  311. "key": "biw",
  312. "value": "944"
  313. },
  314. {
  315. "key": "brdim",
  316. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  317. },
  318. {
  319. "key": "vis",
  320. "value": "1"
  321. },
  322. {
  323. "key": "wgl",
  324. "value": "true"
  325. },
  326. {
  327. "key": "ca_type",
  328. "value": "image"
  329. }
  330. ],
  331. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  332. }
  333. },
  334. # "browseId": browse_id,
  335. "params": "EgZ2aWRlb3PyBgQKAjoA",
  336. "continuation": cls.continuation
  337. })
  338. headers = {
  339. 'authority': 'www.youtube.com',
  340. 'accept': '*/*',
  341. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  342. 'cache-control': 'no-cache',
  343. 'content-type': 'application/json',
  344. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  345. 'origin': 'https://www.youtube.com',
  346. 'pragma': 'no-cache',
  347. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  348. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  349. 'sec-ch-ua-arch': '"arm"',
  350. 'sec-ch-ua-bitness': '"64"',
  351. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  352. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  353. 'sec-ch-ua-mobile': '?0',
  354. 'sec-ch-ua-model': '',
  355. 'sec-ch-ua-platform': '"macOS"',
  356. 'sec-ch-ua-platform-version': '"12.4.0"',
  357. 'sec-ch-ua-wow64': '?0',
  358. 'sec-fetch-dest': 'empty',
  359. 'sec-fetch-mode': 'same-origin',
  360. 'sec-fetch-site': 'same-origin',
  361. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  362. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  363. 'x-youtube-bootstrap-logged-in': 'false',
  364. 'x-youtube-client-name': '1',
  365. 'x-youtube-client-version': '2.20230201.01.00'
  366. }
  367. try:
  368. response = requests.post(url=url, headers=headers, data=payload)
  369. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  370. cls.continuation = response.json()['trackingParams']
  371. if response.status_code != 200:
  372. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  373. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  374. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  375. elif 'continuationContents' in response.json():
  376. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  377. if 'richGridContinuation' not in response.json()['continuationContents']:
  378. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  379. Common.logger(log_type, crawler).warning(
  380. f'get_feeds_response:{response.json()["continuationContents"]}\n')
  381. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  382. Common.logger(log_type, crawler).warning(
  383. f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  384. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  385. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  386. return feeds
  387. elif 'onResponseReceivedActions' in response.json():
  388. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  389. if len(response.json()['onResponseReceivedActions']) == 0:
  390. Common.logger(log_type, crawler).warning(
  391. f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  392. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  393. Common.logger(log_type, crawler).warning(
  394. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  395. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0][
  396. 'appendContinuationItemsAction']:
  397. Common.logger(log_type, crawler).warning(
  398. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  399. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction'][
  400. 'continuationItems']) == 0:
  401. Common.logger(log_type, crawler).warning(
  402. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  403. else:
  404. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"][
  405. "continuationItems"]
  406. return feeds
  407. else:
  408. Common.logger(log_type, crawler).info('feeds is None\n')
  409. except Exception as e:
  410. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  411. @classmethod
  412. def get_first_page(cls, user_url):
  413. try:
  414. res = requests.get(url=user_url, headers=cls.headers)
  415. info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
  416. ytInitialData = json.loads(info)
  417. video_list = \
  418. ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content'][
  419. 'richGridRenderer']['contents']
  420. except Exception as e:
  421. video_list = []
  422. return video_list
  423. @classmethod
  424. def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  425. machine, out_user_url, continuation):
  426. post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  427. payload = json.dumps({
  428. "context": {
  429. "client": {
  430. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
  431. "clientName": "WEB",
  432. "clientVersion": "2.20230221.06.00",
  433. "osName": "Macintosh",
  434. "osVersion": "10_15_7",
  435. "originalUrl": "https://www.youtube.com/@wongkim728/videos",
  436. "screenPixelDensity": 2,
  437. "platform": "DESKTOP",
  438. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  439. "configInfo": {
  440. "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
  441. },
  442. "screenDensityFloat": 2,
  443. "timeZone": "Asia/Shanghai",
  444. "browserName": "Chrome",
  445. "browserVersion": "110.0.0.0",
  446. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  447. "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
  448. "screenWidthPoints": 576,
  449. "screenHeightPoints": 764,
  450. "utcOffsetMinutes": 480,
  451. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  452. "connectionType": "CONN_CELLULAR_4G",
  453. "memoryTotalKbytes": "8000000",
  454. "mainAppWebInfo": {
  455. "graftUrl": out_user_url,
  456. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  457. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  458. "isWebNativeShareAvailable": False
  459. }
  460. },
  461. "user": {
  462. "lockedSafetyMode": False
  463. },
  464. "request": {
  465. "useSsl": True,
  466. "internalExperimentFlags": [],
  467. "consistencyTokenJars": []
  468. },
  469. "clickTracking": {
  470. "clickTrackingParams": ""
  471. },
  472. "adSignalsInfo": {
  473. "params": [],
  474. "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
  475. }
  476. },
  477. "continuation": continuation
  478. })
  479. headers = {
  480. # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
  481. 'content-type': 'application/json',
  482. 'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
  483. 'origin': 'https://www.youtube.com',
  484. 'referer': out_user_url,
  485. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  486. }
  487. try:
  488. res = requests.request("POST", post_url, headers=headers, data=payload).json()
  489. video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  490. for data in video_infos:
  491. if 'richItemRenderer' in data:
  492. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  493. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  494. # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
  495. # 发布时间<=7天
  496. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  497. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  498. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  499. machine)
  500. else:
  501. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  502. return
  503. else:
  504. continuation = cls.get_continuation(data)
  505. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  506. machine, out_user_url, continuation)
  507. except:
  508. return
  509. @classmethod
  510. def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  511. machine, out_user_url):
  512. try:
  513. feeds = cls.get_first_page(out_user_url)
  514. for data in feeds:
  515. if 'richItemRenderer' in data:
  516. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  517. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  518. # 发布时间<=7天
  519. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  520. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  521. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  522. machine)
  523. else:
  524. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  525. return
  526. else:
  527. continuation = cls.get_continuation(data)
  528. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  529. machine, out_user_url, continuation=continuation)
  530. except Exception as e:
  531. Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  532. @classmethod
  533. def filter_emoji(cls, title):
  534. # 过滤表情
  535. try:
  536. co = re.compile(u'[\U00010000-\U0010ffff]')
  537. except re.error:
  538. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  539. return co.sub("", title)
  540. @classmethod
  541. def is_contain_chinese(cls, strword):
  542. for ch in strword:
  543. if u'\u4e00' <= ch <= u'\u9fff':
  544. return True
  545. return False
  546. @classmethod
  547. def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id, machine):
  548. try:
  549. if 'streamingData' not in video_dict:
  550. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  551. elif 'videoDetails' not in video_dict:
  552. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  553. elif 'microformat' not in video_dict:
  554. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  555. else:
  556. playerMicroformatRenderer = video_dict['microformat']['playerMicroformatRenderer']
  557. videoDetails = video_dict['videoDetails']
  558. # streamingData = response.json()['streamingData']
  559. # video_title
  560. if 'title' not in videoDetails:
  561. video_title = ''
  562. else:
  563. video_title = videoDetails['title']
  564. video_title = cls.filter_emoji(video_title)
  565. # if Translate.is_contains_chinese(video_title) is False:
  566. if not cls.is_contain_chinese(video_title):
  567. video_title = Translate.google_translate(video_title, machine) \
  568. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  569. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  570. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  571. if 'lengthSeconds' not in videoDetails:
  572. duration = 0
  573. else:
  574. duration = int(videoDetails['lengthSeconds'])
  575. # play_cnt
  576. if 'viewCount' not in videoDetails:
  577. play_cnt = 0
  578. else:
  579. play_cnt = int(videoDetails['viewCount'])
  580. # publish_time
  581. if 'publishDate' not in playerMicroformatRenderer:
  582. publish_time = ''
  583. else:
  584. publish_time = playerMicroformatRenderer['publishDate']
  585. if publish_time == '':
  586. publish_time_stamp = 0
  587. elif ':' in publish_time:
  588. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  589. else:
  590. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  591. # user_name
  592. if 'author' not in videoDetails:
  593. user_name = ''
  594. else:
  595. user_name = videoDetails['author']
  596. # cover_url
  597. if 'thumbnail' not in videoDetails:
  598. cover_url = ''
  599. elif 'thumbnails' not in videoDetails['thumbnail']:
  600. cover_url = ''
  601. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  602. cover_url = ''
  603. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  604. cover_url = ''
  605. else:
  606. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  607. # video_url
  608. # if 'formats' not in streamingData:
  609. # video_url = ''
  610. # elif len(streamingData['formats']) == 0:
  611. # video_url = ''
  612. # elif 'url' not in streamingData['formats'][-1]:
  613. # video_url = ''
  614. # else:
  615. # video_url = streamingData['formats'][-1]['url']
  616. video_url = f"https://www.youtube.com/watch?v={video_id}"
  617. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  618. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  619. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  620. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  621. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  622. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  623. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  624. video_dict = {
  625. 'video_title': video_title,
  626. 'video_id': video_id,
  627. 'duration': duration,
  628. 'play_cnt': play_cnt,
  629. 'publish_time': publish_time,
  630. 'publish_time_stamp': publish_time_stamp,
  631. 'user_name': user_name,
  632. 'out_uid': out_uid,
  633. 'cover_url': cover_url,
  634. 'video_url': video_url,
  635. }
  636. return video_dict
  637. except Exception as e:
  638. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
      """Fetch metadata for one YouTube video from the ``youtubei/v1/player`` endpoint.

      :param log_type: forwarded to ``Common.logger`` to select the log.
      :param crawler: forwarded to ``Common.logger`` to select the log.
      :param out_uid: copied unchanged into the result as ``out_uid``.
      :param video_id: YouTube video id; interpolated into the request payload,
          the cookie/referer headers and the resulting ``video_url``.
      :param machine: forwarded to ``Translate.google_translate``.
      :return: dict with the keys ``video_title``, ``video_id``, ``duration``,
          ``play_cnt``, ``publish_time``, ``publish_time_stamp``, ``user_name``,
          ``out_uid``, ``cover_url`` and ``video_url``; ``None`` when the HTTP
          status is not 200, when the response lacks ``streamingData``,
          ``videoDetails`` or ``microformat``, or when any exception is raised
          (the exception is logged, not propagated).
      """
      try:
          # NOTE(review): the key looks like the public web-client key rather than a secret — confirm.
          url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
          # Captured browser request body; only the video id varies between calls.
          payload = json.dumps({
              "context": {
                  "client": {
                      "hl": "zh-CN",
                      "gl": "US",
                      "remoteHost": "38.93.247.21",
                      "deviceMake": "Apple",
                      "deviceModel": "",
                      "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                      "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                      "clientName": "WEB",
                      "clientVersion": "2.20230201.01.00",
                      "osName": "Macintosh",
                      "osVersion": "10_15_7",
                      "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
                      "platform": "DESKTOP",
                      "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                      "configInfo": {
                          "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                      },
                      "timeZone": "Asia/Shanghai",
                      "browserName": "Chrome",
                      "browserVersion": "109.0.0.0",
                      "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                      "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                      "screenWidthPoints": 1037,
                      "screenHeightPoints": 969,
                      "screenPixelDensity": 1,
                      "screenDensityFloat": 1,
                      "utcOffsetMinutes": 480,
                      "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                      "memoryTotalKbytes": "8000000",
                      "clientScreen": "WATCH",
                      "mainAppWebInfo": {
                          "graftUrl": f"/watch?v={video_id}",
                          "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                          "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                          "isWebNativeShareAvailable": True
                      }
                  },
                  "user": {
                      "lockedSafetyMode": False
                  },
                  "request": {
                      "useSsl": True,
                      "internalExperimentFlags": [],
                      "consistencyTokenJars": []
                  },
                  "clickTracking": {
                      "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
                  },
                  "adSignalsInfo": {
                      "params": [
                          {"key": "dt", "value": "1675749222611"},
                          {"key": "flash", "value": "0"},
                          {"key": "frm", "value": "0"},
                          {"key": "u_tz", "value": "480"},
                          {"key": "u_his", "value": "3"},
                          {"key": "u_h", "value": "1080"},
                          {"key": "u_w", "value": "1920"},
                          {"key": "u_ah", "value": "1080"},
                          {"key": "u_aw", "value": "1920"},
                          {"key": "u_cd", "value": "24"},
                          {"key": "bc", "value": "31"},
                          {"key": "bih", "value": "969"},
                          {"key": "biw", "value": "1037"},
                          {"key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"},
                          {"key": "vis", "value": "1"},
                          {"key": "wgl", "value": "true"},
                          {"key": "ca_type", "value": "image"}
                      ],
                      "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
                  }
              },
              "videoId": str(video_id),
              "playbackContext": {
                  "contentPlaybackContext": {
                      "currentUrl": f"/watch?v={video_id}",
                      "vis": 0,
                      "splay": False,
                      "autoCaptionsDefaultOn": False,
                      "autonavState": "STATE_NONE",
                      "html5Preference": "HTML5_PREF_WANTS",
                      "signatureTimestamp": 19394,
                      "referer": f"https://www.youtube.com/watch?v={video_id}",
                      "lactMilliseconds": "-1",
                      "watchAmbientModeContext": {
                          "watchAmbientModeEnabled": True
                      }
                  }
              },
              "racyCheckOk": False,
              "contentCheckOk": False
          })
          headers = {
              'authority': 'www.youtube.com',
              'accept': '*/*',
              'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
              'cache-control': 'no-cache',
              'content-type': 'application/json',
              'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
              'origin': 'https://www.youtube.com',
              'pragma': 'no-cache',
              'referer': f'https://www.youtube.com/watch?v={video_id}',
              'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
              'sec-ch-ua-arch': '"arm"',
              'sec-ch-ua-bitness': '"64"',
              'sec-ch-ua-full-version': '"109.0.1518.52"',
              'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-model': '',
              'sec-ch-ua-platform': '"macOS"',
              'sec-ch-ua-platform-version': '"12.4.0"',
              'sec-ch-ua-wow64': '?0',
              'sec-fetch-dest': 'empty',
              'sec-fetch-mode': 'same-origin',
              'sec-fetch-site': 'same-origin',
              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
              'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
              'x-youtube-bootstrap-logged-in': 'false',
              'x-youtube-client-name': '1',
              'x-youtube-client-version': '2.20230201.01.00'
          }
          # Without a timeout a stalled connection would block the crawler forever;
          # a timeout raises and is logged by the except clause below.
          response = requests.post(url=url, headers=headers, data=payload, timeout=30)
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
              return None

          # Decode the body once instead of re-parsing it for every lookup.
          response_json = response.json()
          for required_key in ('streamingData', 'videoDetails', 'microformat'):
              if required_key not in response_json:
                  Common.logger(log_type, crawler).warning(f"get_video_info_response:{response_json}\n")
                  return None

          playerMicroformatRenderer = response_json['microformat']['playerMicroformatRenderer']
          videoDetails = response_json['videoDetails']

          # video_title: drop quote characters, then emoji; a title without Chinese
          # characters is machine-translated. An empty title skips both steps so
          # no translation request is made for nothing.
          video_title = videoDetails.get('title', '').replace('"', '').replace("'", '')
          if video_title:
              video_title = cls.filter_emoji(video_title)
          if video_title and not cls.is_contain_chinese(video_title):
              # "amp;" has to be removed before ";": with the opposite order the
              # "amp;" replacement could never match and "&amp;" left "amp" behind.
              video_title = Translate.google_translate(video_title, machine) \
                  .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
                  .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
                  .replace("amp;", "").replace(";", "")  # automatically translate the title into Chinese

          # duration / play_cnt: default to 0 when the field is absent.
          duration = int(videoDetails.get('lengthSeconds', 0))
          play_cnt = int(videoDetails.get('viewCount', 0))

          # publish_time: accepted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS"; any
          # other non-empty shape raises and is reported by the except clause.
          publish_time = playerMicroformatRenderer.get('publishDate', '')
          if publish_time == '':
              publish_time_stamp = 0
          elif ':' in publish_time:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
          else:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

          # user_name
          user_name = videoDetails.get('author', '')

          # cover_url: the last entry of the thumbnail list, '' when unavailable.
          thumbnails = videoDetails.get('thumbnail', {}).get('thumbnails', [])
          cover_url = thumbnails[-1].get('url', '') if thumbnails else ''

          # video_url: the watch-page URL, not a stream URL taken from streamingData.
          video_url = f"https://www.youtube.com/watch?v={video_id}"

          Common.logger(log_type, crawler).info(f'video_title:{video_title}')
          Common.logger(log_type, crawler).info(f'video_id:{video_id}')
          Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
          Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
          Common.logger(log_type, crawler).info(f'user_name:{user_name}')
          Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
          Common.logger(log_type, crawler).info(f'video_url:{video_url}')
          video_dict = {
              'video_title': video_title,
              'video_id': video_id,
              'duration': duration,
              'play_cnt': play_cnt,
              'publish_time': publish_time,
              'publish_time_stamp': publish_time_stamp,
              'user_name': user_name,
              'out_uid': out_uid,
              'cover_url': cover_url,
              'video_url': video_url,
          }
          return video_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
          return None
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count the ``crawler_video`` rows stored for this platform and ``video_id``.

      :return: number of rows returned by ``MysqlHelper.get_values``; callers
          treat a non-zero value as "already downloaded".
      """
      # NOTE(review): the statement is assembled by string interpolation; if
      # MysqlHelper supports bound parameters they would be safer — confirm.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      matching_rows = MysqlHelper.get_values(log_type, crawler, query, env, machine)
      return len(matching_rows)
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
      """Download one video, publish it, and record it in Feishu and MySQL.

      The video is skipped (with an info log) when its title contains a
      configured filter word, when title or URL is empty, when the duration is
      outside 60..1200 seconds, or when ``repeat_video`` reports it as stored.

      :param log_type: forwarded to the logging/storage helpers.
      :param crawler: forwarded to the helpers; also the root of the local
          ``./{crawler}/videos/`` working directory.
      :param video_dict: metadata dict as produced by ``get_video_info``; it is
          extended in place with width/height, counters, avatar and session.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param our_uid: forwarded to ``Publish.upload_and_publish`` and stored as ``user_id``.
      :param env: ``'dev'`` selects the test admin host for the video link.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param machine: forwarded to the MySQL helpers.
      :return: always ``None``; every exception is logged and swallowed.
      """
      def remove_video_dir(path):
          """Delete ``path`` recursively; a directory that is already gone is fine."""
          try:
              shutil.rmtree(path)
          except FileNotFoundError:
              pass

      try:
          filter_words = get_config_from_mysql(log_type, crawler, env, text='filter', action='get_author_map')
          for filter_word in filter_words:
              if filter_word in video_dict['video_title']:
                  # NOTE(review): the title is passed as a positional argument for the
                  # '{}' placeholder — presumably the logger formats it; confirm.
                  Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
                  return

          if video_dict['video_title'] == '' or video_dict['video_url'] == '':
              Common.logger(log_type, crawler).info('无效视频\n')
              return
          if video_dict['duration'] > 1200 or video_dict['duration'] < 60:
              Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
              return
          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          # Download the video.
          Common.logger(log_type, crawler).info('开始下载视频...')
          Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'], video_dict['video_url'])
          # The working directory is named after the MD5 of the title.
          md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
          video_dir = f"./{crawler}/videos/{md_title}"
          try:
              if os.path.getsize(f"{video_dir}/video.mp4") == 0:
                  # Empty download: remove the video directory.
                  remove_video_dir(video_dir)
                  Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
                  return
          except FileNotFoundError:
              # video.mp4 is missing, and the directory may be missing too. The
              # cleanup must not raise again, otherwise this expected case would
              # be reported as a generic download_publish failure.
              remove_video_dir(video_dir)
              Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
              return

          # Width and height are fixed values; they are not probed from the file.
          video_width = 1280
          video_height = 720
          duration = int(video_dict['duration'])
          Common.logger(log_type, crawler).info(f'video_width:{video_width}')
          Common.logger(log_type, crawler).info(f'video_height:{video_height}')
          Common.logger(log_type, crawler).info(f'duration:{duration}')
          video_dict['video_width'] = video_width
          video_dict['video_height'] = video_height
          video_dict['duration'] = duration
          video_dict['comment_cnt'] = 0
          video_dict['like_cnt'] = 0
          video_dict['share_cnt'] = 0
          video_dict['avatar_url'] = video_dict['cover_url']
          video_dict['session'] = f'youtube{int(time.time())}'
          rule = '1,2'
          # Download the cover image.
          Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
          # Save the video's text metadata.
          Common.save_video_info(log_type, crawler, video_dict)

          # Upload the video; only the admin host of the resulting link depends on env.
          Common.logger(log_type, crawler).info(f"开始上传视频")
          our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"
          Common.logger(log_type, crawler).info("视频上传完成")

          if our_video_id is None:
              # Publishing produced no id: remove the video directory and stop.
              remove_video_dir(video_dir)
              return

          # Save the video info to Feishu: insert a row, then fill it.
          Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
          upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
          values = [[upload_time,
                     "定向榜",
                     video_dict['video_id'],
                     video_dict['video_title'],
                     our_video_link,
                     video_dict['play_cnt'],
                     video_dict['duration'],
                     f'{video_width}*{video_height}',
                     video_dict['publish_time'],
                     video_dict['user_name'],
                     video_dict['cover_url'],
                     video_dict['video_url']
                     ]]
          Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
          Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

          # Save the video info to the database.
          # NOTE(review): values are interpolated straight into the statement; if
          # MysqlHelper supports bound parameters they would be safer — confirm.
          sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          "{our_uid}",
                          "{video_dict['out_uid']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(duration)},
                          "{video_dict['publish_time']}",
                          {int(video_dict['play_cnt'])},
                          "{rule}",
                          {int(video_width)},
                          {int(video_height)}) """
          MysqlHelper.update_values(log_type, crawler, sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          # Logged at error level, like the other methods' failure handlers.
          Common.logger(log_type, crawler).error(f"download_publish异常:{e}\n")
  @classmethod
  def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Crawl the videos of every followed user returned by ``get_user_from_mysql``.

      For each user record the profile link, nickname and internal uid are
      read, ``get_videos`` is called, and ``cls.continuation`` is then reset to
      an empty string. An empty user list only produces a warning. Any
      exception is logged and ends the run; nothing is returned.
      """
      try:
          followed_users = get_user_from_mysql(log_type, crawler, crawler, env, action='get_author_map')
          if len(followed_users) == 0:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return

          for followed_user in followed_users:
              profile_url = followed_user['link']
              # Fourth '/'-separated piece of the link, i.e. the first path segment
              # when the link has the form "https://host/<segment>".
              channel_segment = profile_url.split('/')[3]
              nick_name = followed_user['nick_name']
              internal_uid = followed_user['uid']
              Common.logger(log_type, crawler).info(f'获取 {nick_name} 主页视频\n')
              cls.get_videos(
                  log_type, crawler, strategy, oss_endpoint, env,
                  channel_segment, internal_uid, machine, profile_url,
              )
              # Clear the class-level continuation value before the next user.
              cls.continuation = ''
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  if __name__ == "__main__":
      # Manual entry point: prints the user list read from the Feishu sheet
      # 'c467d7' using the 'prod' environment and machine arguments.
      feishu_users = YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'prod', 'prod')
      print(feishu_users)

      # Other ad-hoc invocations kept for debugging; enable one at a time.
      # print(YoutubeFollow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
      # print(YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # YoutubeFollow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # YoutubeFollow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # YoutubeFollow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'hk', 'dev', 'local')
      # print(YoutubeFollow.filter_emoji("<title containing emoji>"))
      # YoutubeFollow.repeat_video('follow', 'youtube', 4, "dev", "local")
      # title = "'Insanely Crowded Shanghai Yu Garden Lantern Festival Walk Tour 2023 4K'"
      # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))