youtube_follow_scheduling.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. import requests
  16. sys.path.append(os.getcwd())
  17. from common.common import Common
  18. # from common.db import MysqlHelper
  19. from common.scheduling_db import MysqlHelper
  20. from common.feishu import Feishu
  21. from common.getuser import getUser
  22. from common.publish import Publish
  23. from common.translate import Translate
  24. from common.public import get_user_from_mysql, get_config_from_mysql
# Module-level HTTP request headers carrying a desktop Chrome user-agent string.
# Passed to requests.get() by YoutubeAuthorScheduling.get_out_user_info.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
  28. def format_nums(data):
  29. data_dict = [{'亿': 100000000}, {'百万': 1000000}, {'万': 10000}, {'k': 1000}, {'w': 10000}, {'m': 1000000},
  30. {'千': 1000}, {'M': 1000000}, {'K': 1000}, {'W': 10000}]
  31. data = str(data)
  32. for i in data_dict:
  33. index = data.find(list(i.keys())[0])
  34. if index > 0:
  35. count = int(float(data[:index]) * list(i.values())[0])
  36. return count
  37. elif index < 0:
  38. continue
  39. count = int(float(re.findall(r'\d+', data)[0]))
  40. return count
  41. class YoutubeAuthorScheduling:
    # Pagination parameter: sent as "continuation" in the get_feeds request body
    # and overwritten by get_feeds after each response.
    continuation = ''
    # Platform being crawled; interpolated as the `platform` value in crawler_user SQL.
    platform = 'youtube'
    # Desktop Chrome user-agent headers; used by get_first_page for page requests.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    }
  49. @classmethod
  50. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  51. """
  52. 获取站外用户信息
  53. :param log_type: 日志
  54. :param crawler: 哪款爬虫
  55. :param browse_id: browse_id
  56. :param out_user_id: 站外用户 UID
  57. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  58. 'out_avatar_url': 站外用户头像,
  59. 'out_fans': 站外用户粉丝量,
  60. 'out_play_cnt': 站外用户总播放量,
  61. 'out_create_time': 站外用户创建时间}
  62. """
  63. try:
  64. url = f'https://www.youtube.com/{out_user_id}/about'
  65. res = requests.get(url=url, headers=headers)
  66. info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
  67. data = json.loads(info)
  68. header = data['header']['c4TabbedHeaderRenderer']
  69. tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
  70. try:
  71. subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
  72. out_fans = format_nums(subsimpleText)
  73. except Exception as e:
  74. out_fans = 0
  75. for tab in tabs:
  76. if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
  77. continue
  78. viewCountText = \
  79. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  80. 'contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText']
  81. out_create_time = \
  82. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  83. 'contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][1]['text']
  84. break
  85. out_user_dict = {
  86. 'out_user_name': header['title'],
  87. 'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
  88. 'out_fans': out_fans,
  89. 'out_play_cnt': int(
  90. viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
  91. 'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
  92. }
  93. # print(out_user_dict)
  94. return out_user_dict
  95. except Exception as e:
  96. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  97. @classmethod
  98. def get_user_from_feishu(cls, log_type, crawler, sheetid, env):
  99. """
  100. 补全飞书用户表信息,并返回
  101. :param log_type: 日志
  102. :param crawler: 哪款爬虫
  103. :param sheetid: 飞书表
  104. :param env: 正式环境:prod,测试环境:dev
  105. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  106. :return: user_list
  107. """
  108. try:
  109. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  110. user_list = []
  111. for i in range(1, len(user_sheet)):
  112. out_uid = user_sheet[i][2]
  113. user_name = user_sheet[i][3]
  114. browse_id = user_sheet[i][5]
  115. our_uid = user_sheet[i][6]
  116. uer_url = user_sheet[i][4]
  117. if out_uid is not None and user_name is not None:
  118. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  119. if our_uid is None:
  120. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  121. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env)
  122. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  123. if not our_user_info:
  124. # 获取站外账号信息,写入数据库
  125. try:
  126. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  127. except Exception as e:
  128. continue
  129. out_avatar_url = out_user_dict['out_avatar_url']
  130. out_create_time = out_user_dict['out_create_time']
  131. out_play_cnt = out_user_dict['out_play_cnt']
  132. out_fans = out_user_dict['out_fans']
  133. tag = 'youtube爬虫,定向爬虫策略'
  134. # 创建站内账号
  135. create_user_dict = {
  136. 'nickName': user_name,
  137. 'avatarUrl': out_avatar_url,
  138. 'tagName': tag,
  139. }
  140. our_uid = getUser.create_uid(log_type, crawler, create_user_dict, env)
  141. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  142. if env == 'prod':
  143. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  144. else:
  145. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  146. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  147. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  148. [[our_uid, our_user_link]])
  149. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  150. sql = f""" insert into crawler_user(user_id,
  151. out_user_id,
  152. out_user_name,
  153. out_avatar_url,
  154. out_create_time,
  155. out_play_cnt,
  156. out_fans,
  157. platform,
  158. tag)
  159. values({our_uid},
  160. "{out_uid}",
  161. "{user_name}",
  162. "{out_avatar_url}",
  163. "{out_create_time}",
  164. {out_play_cnt},
  165. {out_fans},
  166. "{cls.platform}",
  167. "{tag}") """
  168. Common.logger(log_type, crawler).info(f'sql:{sql}')
  169. MysqlHelper.update_values(log_type, crawler, sql, env)
  170. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  171. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  172. else:
  173. our_uid = our_user_info[0][1]
  174. if 'env' == 'prod':
  175. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  176. else:
  177. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  178. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  179. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  180. [[our_uid, our_user_link]])
  181. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  182. user_dict = {
  183. 'out_user_id': out_uid,
  184. 'out_user_name': user_name,
  185. 'out_browse_id': browse_id,
  186. 'our_user_id': our_uid,
  187. 'out_user_url': uer_url
  188. }
  189. user_list.append(user_dict)
  190. else:
  191. pass
  192. return user_list
  193. except Exception as e:
  194. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  195. @classmethod
  196. def get_continuation(cls, data):
  197. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  198. return continuation
  199. @classmethod
  200. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  201. """
  202. 获取个人主页视频列表
  203. :param log_type: 日志
  204. :param crawler: 哪款爬虫
  205. :param browse_id: 每个用户主页的请求参数中唯一值
  206. :param out_uid: 站外用户UID
  207. :return: video_list
  208. """
  209. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  210. payload = json.dumps({
  211. "context": {
  212. "client": {
  213. "hl": "zh-CN",
  214. "gl": "US",
  215. "remoteHost": "38.93.247.21",
  216. "deviceMake": "Apple",
  217. "deviceModel": "",
  218. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  219. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  220. "clientName": "WEB",
  221. "clientVersion": "2.20230201.01.00",
  222. "osName": "Macintosh",
  223. "osVersion": "10_15_7",
  224. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  225. "platform": "DESKTOP",
  226. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  227. "configInfo": {
  228. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  229. },
  230. "timeZone": "Asia/Shanghai",
  231. "browserName": "Chrome",
  232. "browserVersion": "109.0.0.0",
  233. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  234. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  235. "screenWidthPoints": 944,
  236. "screenHeightPoints": 969,
  237. "screenPixelDensity": 1,
  238. "screenDensityFloat": 1,
  239. "utcOffsetMinutes": 480,
  240. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  241. "memoryTotalKbytes": "8000000",
  242. "mainAppWebInfo": {
  243. "graftUrl": f"/{out_uid}/videos",
  244. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  245. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  246. "isWebNativeShareAvailable": True
  247. }
  248. },
  249. "user": {
  250. "lockedSafetyMode": False
  251. },
  252. "request": {
  253. "useSsl": True,
  254. "internalExperimentFlags": [],
  255. "consistencyTokenJars": []
  256. },
  257. "clickTracking": {
  258. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  259. },
  260. "adSignalsInfo": {
  261. "params": [
  262. {
  263. "key": "dt",
  264. "value": "1675676731048"
  265. },
  266. {
  267. "key": "flash",
  268. "value": "0"
  269. },
  270. {
  271. "key": "frm",
  272. "value": "0"
  273. },
  274. {
  275. "key": "u_tz",
  276. "value": "480"
  277. },
  278. {
  279. "key": "u_his",
  280. "value": "4"
  281. },
  282. {
  283. "key": "u_h",
  284. "value": "1080"
  285. },
  286. {
  287. "key": "u_w",
  288. "value": "1920"
  289. },
  290. {
  291. "key": "u_ah",
  292. "value": "1080"
  293. },
  294. {
  295. "key": "u_aw",
  296. "value": "1920"
  297. },
  298. {
  299. "key": "u_cd",
  300. "value": "24"
  301. },
  302. {
  303. "key": "bc",
  304. "value": "31"
  305. },
  306. {
  307. "key": "bih",
  308. "value": "969"
  309. },
  310. {
  311. "key": "biw",
  312. "value": "944"
  313. },
  314. {
  315. "key": "brdim",
  316. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  317. },
  318. {
  319. "key": "vis",
  320. "value": "1"
  321. },
  322. {
  323. "key": "wgl",
  324. "value": "true"
  325. },
  326. {
  327. "key": "ca_type",
  328. "value": "image"
  329. }
  330. ],
  331. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  332. }
  333. },
  334. # "browseId": browse_id,
  335. "params": "EgZ2aWRlb3PyBgQKAjoA",
  336. "continuation": cls.continuation
  337. })
  338. headers = {
  339. 'authority': 'www.youtube.com',
  340. 'accept': '*/*',
  341. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  342. 'cache-control': 'no-cache',
  343. 'content-type': 'application/json',
  344. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  345. 'origin': 'https://www.youtube.com',
  346. 'pragma': 'no-cache',
  347. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  348. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  349. 'sec-ch-ua-arch': '"arm"',
  350. 'sec-ch-ua-bitness': '"64"',
  351. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  352. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  353. 'sec-ch-ua-mobile': '?0',
  354. 'sec-ch-ua-model': '',
  355. 'sec-ch-ua-platform': '"macOS"',
  356. 'sec-ch-ua-platform-version': '"12.4.0"',
  357. 'sec-ch-ua-wow64': '?0',
  358. 'sec-fetch-dest': 'empty',
  359. 'sec-fetch-mode': 'same-origin',
  360. 'sec-fetch-site': 'same-origin',
  361. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  362. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  363. 'x-youtube-bootstrap-logged-in': 'false',
  364. 'x-youtube-client-name': '1',
  365. 'x-youtube-client-version': '2.20230201.01.00'
  366. }
  367. try:
  368. response = requests.post(url=url, headers=headers, data=payload)
  369. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  370. cls.continuation = response.json()['trackingParams']
  371. if response.status_code != 200:
  372. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  373. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  374. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  375. elif 'continuationContents' in response.json():
  376. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  377. if 'richGridContinuation' not in response.json()['continuationContents']:
  378. Common.logger(log_type, crawler).warning(
  379. f'get_feeds_response:{response.json()["continuationContents"]}\n')
  380. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  381. Common.logger(log_type, crawler).warning(
  382. f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  383. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  384. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  385. return feeds
  386. elif 'onResponseReceivedActions' in response.json():
  387. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  388. if len(response.json()['onResponseReceivedActions']) == 0:
  389. Common.logger(log_type, crawler).warning(
  390. f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  391. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  392. Common.logger(log_type, crawler).warning(
  393. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  394. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0][
  395. 'appendContinuationItemsAction']:
  396. Common.logger(log_type, crawler).warning(
  397. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  398. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction'][
  399. 'continuationItems']) == 0:
  400. Common.logger(log_type, crawler).warning(
  401. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  402. else:
  403. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"][
  404. "continuationItems"]
  405. return feeds
  406. else:
  407. Common.logger(log_type, crawler).info('feeds is None\n')
  408. except Exception as e:
  409. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  410. @classmethod
  411. def get_first_page(cls, user_url):
  412. try:
  413. res = requests.get(url=user_url, headers=cls.headers)
  414. info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
  415. ytInitialData = json.loads(info)
  416. video_list = \
  417. ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content'][
  418. 'richGridRenderer']['contents']
  419. except Exception as e:
  420. video_list = []
  421. return video_list
  422. @classmethod
  423. def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, out_user_url, continuation):
  424. post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  425. payload = json.dumps({
  426. "context": {
  427. "client": {
  428. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
  429. "clientName": "WEB",
  430. "clientVersion": "2.20230221.06.00",
  431. "osName": "Macintosh",
  432. "osVersion": "10_15_7",
  433. "originalUrl": "https://www.youtube.com/@wongkim728/videos",
  434. "screenPixelDensity": 2,
  435. "platform": "DESKTOP",
  436. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  437. "configInfo": {
  438. "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
  439. },
  440. "screenDensityFloat": 2,
  441. "timeZone": "Asia/Shanghai",
  442. "browserName": "Chrome",
  443. "browserVersion": "110.0.0.0",
  444. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  445. "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
  446. "screenWidthPoints": 576,
  447. "screenHeightPoints": 764,
  448. "utcOffsetMinutes": 480,
  449. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  450. "connectionType": "CONN_CELLULAR_4G",
  451. "memoryTotalKbytes": "8000000",
  452. "mainAppWebInfo": {
  453. "graftUrl": out_user_url,
  454. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  455. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  456. "isWebNativeShareAvailable": False
  457. }
  458. },
  459. "user": {
  460. "lockedSafetyMode": False
  461. },
  462. "request": {
  463. "useSsl": True,
  464. "internalExperimentFlags": [],
  465. "consistencyTokenJars": []
  466. },
  467. "clickTracking": {
  468. "clickTrackingParams": ""
  469. },
  470. "adSignalsInfo": {
  471. "params": [],
  472. "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
  473. }
  474. },
  475. "continuation": continuation
  476. })
  477. headers = {
  478. # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
  479. 'content-type': 'application/json',
  480. 'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
  481. 'origin': 'https://www.youtube.com',
  482. 'referer': out_user_url,
  483. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  484. }
  485. try:
  486. res = requests.request("POST", post_url, headers=headers, data=payload).json()
  487. video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  488. for data in video_infos:
  489. if 'richItemRenderer' in data:
  490. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  491. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
  492. # 发布时间<=7天
  493. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  494. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  495. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
  496. else:
  497. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  498. return
  499. else:
  500. continuation = cls.get_continuation(data)
  501. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,out_user_url, continuation)
  502. except:
  503. return
  504. @classmethod
  505. def get_videos(cls, log_type, crawler, strategy, task, oss_endpoint, env, out_uid, our_uid, out_user_url):
  506. try:
  507. feeds = cls.get_first_page(out_user_url)
  508. for data in feeds:
  509. if 'richItemRenderer' in data:
  510. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  511. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
  512. # 发布时间<=7天
  513. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  514. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  515. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
  516. else:
  517. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  518. return
  519. else:
  520. continuation = cls.get_continuation(data)
  521. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, out_user_url, continuation=continuation)
  522. except Exception as e:
  523. Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  524. @classmethod
  525. def filter_emoji(cls, title):
  526. # 过滤表情
  527. try:
  528. co = re.compile(u'[\U00010000-\U0010ffff]')
  529. except re.error:
  530. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  531. return co.sub("", title)
  532. @classmethod
  533. def is_contain_chinese(cls, strword):
  534. for ch in strword:
  535. if u'\u4e00' <= ch <= u'\u9fff':
  536. return True
  537. return False
  538. @classmethod
  539. def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id):
  540. try:
  541. if 'streamingData' not in video_dict:
  542. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  543. elif 'videoDetails' not in video_dict:
  544. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  545. elif 'microformat' not in video_dict:
  546. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  547. else:
  548. playerMicroformatRenderer = video_dict['microformat']['playerMicroformatRenderer']
  549. videoDetails = video_dict['videoDetails']
  550. # streamingData = response.json()['streamingData']
  551. # video_title
  552. if 'title' not in videoDetails:
  553. video_title = ''
  554. else:
  555. video_title = videoDetails['title']
  556. video_title = cls.filter_emoji(video_title)
  557. # if Translate.is_contains_chinese(video_title) is False:
  558. if not cls.is_contain_chinese(video_title):
  559. video_title = Translate.google_translate(video_title) \
  560. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  561. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  562. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  563. if 'lengthSeconds' not in videoDetails:
  564. duration = 0
  565. else:
  566. duration = int(videoDetails['lengthSeconds'])
  567. # play_cnt
  568. if 'viewCount' not in videoDetails:
  569. play_cnt = 0
  570. else:
  571. play_cnt = int(videoDetails['viewCount'])
  572. # publish_time
  573. if 'publishDate' not in playerMicroformatRenderer:
  574. publish_time = ''
  575. else:
  576. publish_time = playerMicroformatRenderer['publishDate']
  577. if publish_time == '':
  578. publish_time_stamp = 0
  579. elif ':' in publish_time:
  580. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  581. else:
  582. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  583. # user_name
  584. if 'author' not in videoDetails:
  585. user_name = ''
  586. else:
  587. user_name = videoDetails['author']
  588. # cover_url
  589. if 'thumbnail' not in videoDetails:
  590. cover_url = ''
  591. elif 'thumbnails' not in videoDetails['thumbnail']:
  592. cover_url = ''
  593. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  594. cover_url = ''
  595. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  596. cover_url = ''
  597. else:
  598. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  599. # video_url
  600. # if 'formats' not in streamingData:
  601. # video_url = ''
  602. # elif len(streamingData['formats']) == 0:
  603. # video_url = ''
  604. # elif 'url' not in streamingData['formats'][-1]:
  605. # video_url = ''
  606. # else:
  607. # video_url = streamingData['formats'][-1]['url']
  608. video_url = f"https://www.youtube.com/watch?v={video_id}"
  609. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  610. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  611. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  612. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  613. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  614. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  615. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  616. video_dict = {
  617. 'video_title': video_title,
  618. 'video_id': video_id,
  619. 'duration': duration,
  620. 'play_cnt': play_cnt,
  621. 'publish_time': publish_time,
  622. 'publish_time_stamp': publish_time_stamp,
  623. 'user_name': user_name,
  624. 'out_uid': out_uid,
  625. 'cover_url': cover_url,
  626. 'video_url': video_url,
  627. }
  628. return video_dict
  629. except Exception as e:
  630. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  631. @classmethod
  632. def get_video_info(cls, log_type, crawler, out_uid, video_id):
  633. try:
  634. url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  635. payload = json.dumps({
  636. "context": {
  637. "client": {
  638. "hl": "zh-CN",
  639. "gl": "US",
  640. "remoteHost": "38.93.247.21",
  641. "deviceMake": "Apple",
  642. "deviceModel": "",
  643. "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
  644. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  645. "clientName": "WEB",
  646. "clientVersion": "2.20230201.01.00",
  647. "osName": "Macintosh",
  648. "osVersion": "10_15_7",
  649. "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
  650. "platform": "DESKTOP",
  651. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  652. "configInfo": {
  653. "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  654. },
  655. "timeZone": "Asia/Shanghai",
  656. "browserName": "Chrome",
  657. "browserVersion": "109.0.0.0",
  658. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  659. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
  660. "screenWidthPoints": 1037,
  661. "screenHeightPoints": 969,
  662. "screenPixelDensity": 1,
  663. "screenDensityFloat": 1,
  664. "utcOffsetMinutes": 480,
  665. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  666. "memoryTotalKbytes": "8000000",
  667. "clientScreen": "WATCH",
  668. "mainAppWebInfo": {
  669. "graftUrl": f"/watch?v={video_id}",
  670. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  671. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  672. "isWebNativeShareAvailable": True
  673. }
  674. },
  675. "user": {
  676. "lockedSafetyMode": False
  677. },
  678. "request": {
  679. "useSsl": True,
  680. "internalExperimentFlags": [],
  681. "consistencyTokenJars": []
  682. },
  683. "clickTracking": {
  684. "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
  685. },
  686. "adSignalsInfo": {
  687. "params": [
  688. {
  689. "key": "dt",
  690. "value": "1675749222611"
  691. },
  692. {
  693. "key": "flash",
  694. "value": "0"
  695. },
  696. {
  697. "key": "frm",
  698. "value": "0"
  699. },
  700. {
  701. "key": "u_tz",
  702. "value": "480"
  703. },
  704. {
  705. "key": "u_his",
  706. "value": "3"
  707. },
  708. {
  709. "key": "u_h",
  710. "value": "1080"
  711. },
  712. {
  713. "key": "u_w",
  714. "value": "1920"
  715. },
  716. {
  717. "key": "u_ah",
  718. "value": "1080"
  719. },
  720. {
  721. "key": "u_aw",
  722. "value": "1920"
  723. },
  724. {
  725. "key": "u_cd",
  726. "value": "24"
  727. },
  728. {
  729. "key": "bc",
  730. "value": "31"
  731. },
  732. {
  733. "key": "bih",
  734. "value": "969"
  735. },
  736. {
  737. "key": "biw",
  738. "value": "1037"
  739. },
  740. {
  741. "key": "brdim",
  742. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"
  743. },
  744. {
  745. "key": "vis",
  746. "value": "1"
  747. },
  748. {
  749. "key": "wgl",
  750. "value": "true"
  751. },
  752. {
  753. "key": "ca_type",
  754. "value": "image"
  755. }
  756. ],
  757. "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
  758. }
  759. },
  760. "videoId": str(video_id),
  761. "playbackContext": {
  762. "contentPlaybackContext": {
  763. "currentUrl": f"/watch?v={video_id}",
  764. "vis": 0,
  765. "splay": False,
  766. "autoCaptionsDefaultOn": False,
  767. "autonavState": "STATE_NONE",
  768. "html5Preference": "HTML5_PREF_WANTS",
  769. "signatureTimestamp": 19394,
  770. "referer": f"https://www.youtube.com/watch?v={video_id}",
  771. "lactMilliseconds": "-1",
  772. "watchAmbientModeContext": {
  773. "watchAmbientModeEnabled": True
  774. }
  775. }
  776. },
  777. "racyCheckOk": False,
  778. "contentCheckOk": False
  779. })
  780. headers = {
  781. 'authority': 'www.youtube.com',
  782. 'accept': '*/*',
  783. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  784. 'cache-control': 'no-cache',
  785. 'content-type': 'application/json',
  786. 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
  787. 'origin': 'https://www.youtube.com',
  788. 'pragma': 'no-cache',
  789. 'referer': f'https://www.youtube.com/watch?v={video_id}',
  790. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  791. 'sec-ch-ua-arch': '"arm"',
  792. 'sec-ch-ua-bitness': '"64"',
  793. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  794. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  795. 'sec-ch-ua-mobile': '?0',
  796. 'sec-ch-ua-model': '',
  797. 'sec-ch-ua-platform': '"macOS"',
  798. 'sec-ch-ua-platform-version': '"12.4.0"',
  799. 'sec-ch-ua-wow64': '?0',
  800. 'sec-fetch-dest': 'empty',
  801. 'sec-fetch-mode': 'same-origin',
  802. 'sec-fetch-site': 'same-origin',
  803. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  804. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
  805. 'x-youtube-bootstrap-logged-in': 'false',
  806. 'x-youtube-client-name': '1',
  807. 'x-youtube-client-version': '2.20230201.01.00'
  808. }
  809. response = requests.post(url=url, headers=headers, data=payload)
  810. if response.status_code != 200:
  811. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
  812. elif 'streamingData' not in response.json():
  813. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  814. elif 'videoDetails' not in response.json():
  815. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  816. elif 'microformat' not in response.json():
  817. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  818. else:
  819. playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer']
  820. videoDetails = response.json()['videoDetails']
  821. # streamingData = response.json()['streamingData']
  822. # video_title
  823. if 'title' not in videoDetails:
  824. video_title = ''
  825. else:
  826. video_title = videoDetails['title'].replace('"', '').replace("'", '')
  827. video_title = cls.filter_emoji(video_title)
  828. if not cls.is_contain_chinese(video_title):
  829. video_title = Translate.google_translate(video_title) \
  830. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  831. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  832. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  833. if 'lengthSeconds' not in videoDetails:
  834. duration = 0
  835. else:
  836. duration = int(videoDetails['lengthSeconds'])
  837. # play_cnt
  838. if 'viewCount' not in videoDetails:
  839. play_cnt = 0
  840. else:
  841. play_cnt = int(videoDetails['viewCount'])
  842. # publish_time
  843. if 'publishDate' not in playerMicroformatRenderer:
  844. publish_time = ''
  845. else:
  846. publish_time = playerMicroformatRenderer['publishDate']
  847. if publish_time == '':
  848. publish_time_stamp = 0
  849. elif ':' in publish_time:
  850. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  851. else:
  852. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  853. # user_name
  854. if 'author' not in videoDetails:
  855. user_name = ''
  856. else:
  857. user_name = videoDetails['author']
  858. # cover_url
  859. if 'thumbnail' not in videoDetails:
  860. cover_url = ''
  861. elif 'thumbnails' not in videoDetails['thumbnail']:
  862. cover_url = ''
  863. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  864. cover_url = ''
  865. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  866. cover_url = ''
  867. else:
  868. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  869. # video_url
  870. # if 'formats' not in streamingData:
  871. # video_url = ''
  872. # elif len(streamingData['formats']) == 0:
  873. # video_url = ''
  874. # elif 'url' not in streamingData['formats'][-1]:
  875. # video_url = ''
  876. # else:
  877. # video_url = streamingData['formats'][-1]['url']
  878. video_url = f"https://www.youtube.com/watch?v={video_id}"
  879. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  880. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  881. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  882. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  883. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  884. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  885. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  886. video_dict = {
  887. 'video_title': video_title,
  888. 'video_id': video_id,
  889. 'duration': duration,
  890. 'play_cnt': play_cnt,
  891. 'publish_time': publish_time,
  892. 'publish_time_stamp': publish_time_stamp,
  893. 'user_name': user_name,
  894. 'out_uid': out_uid,
  895. 'cover_url': cover_url,
  896. 'video_url': video_url,
  897. }
  898. return video_dict
  899. except Exception as e:
  900. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  901. @classmethod
  902. def repeat_video(cls, log_type, crawler, video_id, env):
  903. sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
  904. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  905. return len(repeat_video)
  906. @classmethod
  907. def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint):
  908. try:
  909. filter_words = get_config_from_mysql(log_type, crawler, env, text='filter', action='get_author_map')
  910. for filter_word in filter_words:
  911. if filter_word in video_dict['video_title']:
  912. Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
  913. return
  914. if video_dict['video_title'] == '' or video_dict['video_url'] == '':
  915. Common.logger(log_type, crawler).info('无效视频\n')
  916. elif video_dict['duration'] > 1200 or video_dict['duration'] < 60:
  917. Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
  918. # elif repeat_video is not None and len(repeat_video) != 0:
  919. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
  920. Common.logger(log_type, crawler).info('视频已下载\n')
  921. else:
  922. # 下载视频
  923. Common.logger(log_type, crawler).info('开始下载视频...')
  924. # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
  925. Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
  926. video_dict['video_url'])
  927. # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
  928. # video_width = int(ffmpeg_dict['width'])
  929. # video_height = int(ffmpeg_dict['height'])
  930. # video_size = int(ffmpeg_dict['size'])
  931. video_width = 1280
  932. video_height = 720
  933. duration = int(video_dict['duration'])
  934. Common.logger(log_type, crawler).info(f'video_width:{video_width}')
  935. Common.logger(log_type, crawler).info(f'video_height:{video_height}')
  936. Common.logger(log_type, crawler).info(f'duration:{duration}')
  937. # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
  938. video_dict['video_width'] = video_width
  939. video_dict['video_height'] = video_height
  940. video_dict['duration'] = duration
  941. video_dict['comment_cnt'] = 0
  942. video_dict['like_cnt'] = 0
  943. video_dict['share_cnt'] = 0
  944. video_dict['avatar_url'] = video_dict['cover_url']
  945. video_dict['session'] = f'youtube{int(time.time())}'
  946. rule = '1,2'
  947. # if duration < 60 or duration > 600:
  948. # # 删除视频文件夹
  949. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  950. # Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
  951. # return
  952. # if duration == 0 or duration is None:
  953. # # 删除视频文件夹
  954. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  955. # Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
  956. # return
  957. # else:
  958. # 下载封面
  959. Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
  960. # 保存视频文本信息
  961. Common.save_video_info(log_type, crawler, video_dict)
  962. # 上传视频
  963. Common.logger(log_type, crawler).info(f"开始上传视频")
  964. if env == 'dev':
  965. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  966. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  967. else:
  968. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  969. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  970. Common.logger(log_type, crawler).info("视频上传完成")
  971. if our_video_id is None:
  972. # 删除视频文件夹
  973. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  974. return
  975. # 视频信息保存至飞书
  976. Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
  977. # 视频ID工作表,首行写入数据
  978. upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
  979. values = [[upload_time,
  980. "定向榜",
  981. video_dict['video_id'],
  982. video_dict['video_title'],
  983. our_video_link,
  984. video_dict['play_cnt'],
  985. video_dict['duration'],
  986. f'{video_width}*{video_height}',
  987. video_dict['publish_time'],
  988. video_dict['user_name'],
  989. video_dict['cover_url'],
  990. video_dict['video_url']
  991. ]]
  992. # time.sleep(1)
  993. Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
  994. Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
  995. # 视频信息保存数据库
  996. sql = f""" insert into crawler_video(video_id,
  997. user_id,
  998. out_user_id,
  999. platform,
  1000. strategy,
  1001. out_video_id,
  1002. video_title,
  1003. cover_url,
  1004. video_url,
  1005. duration,
  1006. publish_time,
  1007. play_cnt,
  1008. crawler_rule,
  1009. width,
  1010. height)
  1011. values({our_video_id},
  1012. "{our_uid}",
  1013. "{video_dict['out_uid']}",
  1014. "{cls.platform}",
  1015. "定向爬虫策略",
  1016. "{video_dict['video_id']}",
  1017. "{video_dict['video_title']}",
  1018. "{video_dict['cover_url']}",
  1019. "{video_dict['video_url']}",
  1020. {int(duration)},
  1021. "{video_dict['publish_time']}",
  1022. {int(video_dict['play_cnt'])},
  1023. "{rule}",
  1024. {int(video_width)},
  1025. {int(video_height)}) """
  1026. MysqlHelper.update_values(log_type, crawler, sql, env)
  1027. Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  1028. except Exception as e:
  1029. Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n")
  1030. @classmethod
  1031. def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
  1032. try:
  1033. user_list = get_user_from_mysql(log_type, crawler, crawler, env, action='get_author_map')
  1034. strategy = '定向抓取策略'
  1035. for user_dict in user_list:
  1036. out_user_url = user_dict['link']
  1037. out_uid = out_user_url.split('/')[3]
  1038. user_name = user_dict['nick_name']
  1039. our_uid = user_dict['uid']
  1040. Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
  1041. cls.get_videos(
  1042. log_type=log_type,
  1043. crawler=crawler,
  1044. strategy=strategy,
  1045. task=task,
  1046. oss_endpoint=oss_endpoint,
  1047. env=env,
  1048. our_uid=our_uid,
  1049. out_uid=out_uid,
  1050. out_user_url=out_user_url
  1051. )
  1052. cls.continuation = ''
  1053. except Exception as e:
  1054. Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  1055. if __name__ == "__main__":
  1056. YoutubeAuthorScheduling.get_follow_videos('author', 'youtube', '', 'outer', 'dev')