youtube_follow_api.py 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/3
  4. """
  5. YouTube 定向榜
  6. 1. 发布时间<=1个月
  7. 2. 10分钟>=时长>=1分钟
  8. """
  9. import os
  10. import re
  11. import shutil
  12. import sys
  13. import time
  14. import json
  15. import requests
  16. sys.path.append(os.getcwd())
  17. from common.common import Common
  18. from common.db import MysqlHelper
  19. from common.feishu import Feishu
  20. from common.getuser import getUser
  21. from common.publish import Publish
  22. from common.translate import Translate
  23. from common.public import get_user_from_mysql
# Module-level default HTTP request headers; only a desktop Chrome user-agent is set.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
  27. def format_nums(data):
  28. data_dict = [{'亿': 100000000}, {'百万': 1000000}, {'万': 10000}, {'k': 1000}, {'w': 10000}, {'m': 1000000},
  29. {'千': 1000}, {'M': 1000000}, {'K': 1000}, {'W': 10000}]
  30. data = str(data)
  31. for i in data_dict:
  32. index = data.find(list(i.keys())[0])
  33. if index > 0:
  34. count = int(float(data[:index]) * list(i.values())[0])
  35. return count
  36. elif index < 0:
  37. continue
  38. count = int(float(re.findall(r'\d+', data)[0]))
  39. return count
  40. class YoutubeFollow:
  41. # 翻页参数
  42. continuation = ''
  43. # 抓取平台
  44. platform = 'youtube'
  45. headers = {
  46. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  47. }
  48. # @classmethod
  49. # def get_browse_id(cls, log_type, crawler, out_user_id, machine):
  50. # """
  51. # 获取每个用户的 browse_id
  52. # :param log_type: 日志
  53. # :param crawler: 哪款爬虫
  54. # :param out_user_id: 站外用户 UID
  55. # :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local
  56. # :return: browse_id
  57. # """
  58. # try:
  59. # # 打印请求配置
  60. # ca = DesiredCapabilities.CHROME
  61. # ca["goog:loggingPrefs"] = {"performance": "ALL"}
  62. #
  63. # # 不打开浏览器运行
  64. # chrome_options = webdriver.ChromeOptions()
  65. # chrome_options.add_argument("--headless")
  66. # chrome_options.add_argument(
  67. # '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  68. # chrome_options.add_argument("--no-sandbox")
  69. #
  70. # # driver初始化
  71. # if machine == 'aliyun' or machine == 'aliyun_hk':
  72. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  73. # elif machine == 'macpro':
  74. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  75. # service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  76. # elif machine == 'macair':
  77. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  78. # service=Service('/Users/piaoquan/Downloads/chromedriver'))
  79. # else:
  80. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
  81. # '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  82. #
  83. # driver.implicitly_wait(10)
  84. # url = f'https://www.youtube.com/{out_user_id}/videos'
  85. # driver.get(url)
  86. # # driver.save_screenshot("./1.png")
  87. # # 向上滑动 1000 个像素
  88. # # driver.execute_script('window.scrollBy(0, 2000)')
  89. # # driver.save_screenshot("./2.png")
  90. # time.sleep(3)
  91. # accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
  92. # accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
  93. # if len(accept_btns) != 0:
  94. # accept_btns[0].click()
  95. # time.sleep(2)
  96. # elif len(accept_btns_eng) != 0:
  97. # accept_btns_eng[0].click()
  98. # time.sleep(2)
  99. # browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
  100. # driver.quit()
  101. # return browse_id
  102. # except Exception as e:
  103. # Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
  104. @classmethod
  105. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  106. """
  107. 获取站外用户信息
  108. :param log_type: 日志
  109. :param crawler: 哪款爬虫
  110. :param browse_id: browse_id
  111. :param out_user_id: 站外用户 UID
  112. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  113. 'out_avatar_url': 站外用户头像,
  114. 'out_fans': 站外用户粉丝量,
  115. 'out_play_cnt': 站外用户总播放量,
  116. 'out_create_time': 站外用户创建时间}
  117. """
  118. try:
  119. url = f'https://www.youtube.com/{out_user_id}/about'
  120. res = requests.get(url=url, headers=headers)
  121. info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
  122. data = json.loads(info)
  123. header = data['header']['c4TabbedHeaderRenderer']
  124. tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
  125. try:
  126. subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
  127. out_fans = format_nums(subsimpleText)
  128. except Exception as e:
  129. out_fans = 0
  130. for tab in tabs:
  131. if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
  132. continue
  133. viewCountText = \
  134. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  135. 'contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText']
  136. out_create_time = \
  137. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  138. 'contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][1]['text']
  139. break
  140. out_user_dict = {
  141. 'out_user_name': header['title'],
  142. 'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
  143. 'out_fans': out_fans,
  144. 'out_play_cnt': int(
  145. viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
  146. 'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
  147. }
  148. # print(out_user_dict)
  149. return out_user_dict
  150. except Exception as e:
  151. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  152. @classmethod
  153. def get_user_from_feishu(cls, log_type, crawler, sheetid, env, machine):
  154. """
  155. 补全飞书用户表信息,并返回
  156. :param log_type: 日志
  157. :param crawler: 哪款爬虫
  158. :param sheetid: 飞书表
  159. :param env: 正式环境:prod,测试环境:dev
  160. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  161. :return: user_list
  162. """
  163. try:
  164. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  165. user_list = []
  166. for i in range(1, len(user_sheet)):
  167. out_uid = user_sheet[i][2]
  168. user_name = user_sheet[i][3]
  169. browse_id = user_sheet[i][5]
  170. our_uid = user_sheet[i][6]
  171. uer_url = user_sheet[i][4]
  172. if out_uid is not None and user_name is not None:
  173. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  174. if our_uid is None:
  175. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  176. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  177. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  178. if not our_user_info:
  179. # 获取站外账号信息,写入数据库
  180. try:
  181. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  182. except Exception as e:
  183. continue
  184. out_avatar_url = out_user_dict['out_avatar_url']
  185. out_create_time = out_user_dict['out_create_time']
  186. out_play_cnt = out_user_dict['out_play_cnt']
  187. out_fans = out_user_dict['out_fans']
  188. tag = 'youtube爬虫,定向爬虫策略'
  189. # 创建站内账号
  190. create_user_dict = {
  191. 'nickName': user_name,
  192. 'avatarUrl': out_avatar_url,
  193. 'tagName': tag,
  194. }
  195. our_uid = getUser.create_uid(log_type, crawler, create_user_dict, env)
  196. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  197. if env == 'prod':
  198. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  199. else:
  200. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  201. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  202. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  203. [[our_uid, our_user_link]])
  204. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  205. sql = f""" insert into crawler_user(user_id,
  206. out_user_id,
  207. out_user_name,
  208. out_avatar_url,
  209. out_create_time,
  210. out_play_cnt,
  211. out_fans,
  212. platform,
  213. tag)
  214. values({our_uid},
  215. "{out_uid}",
  216. "{user_name}",
  217. "{out_avatar_url}",
  218. "{out_create_time}",
  219. {out_play_cnt},
  220. {out_fans},
  221. "{cls.platform}",
  222. "{tag}") """
  223. Common.logger(log_type, crawler).info(f'sql:{sql}')
  224. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  225. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  226. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  227. else:
  228. our_uid = our_user_info[0][1]
  229. if 'env' == 'prod':
  230. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  231. else:
  232. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  233. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  234. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  235. [[our_uid, our_user_link]])
  236. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  237. user_dict = {
  238. 'out_user_id': out_uid,
  239. 'out_user_name': user_name,
  240. 'out_browse_id': browse_id,
  241. 'our_user_id': our_uid,
  242. 'out_user_url': uer_url
  243. }
  244. user_list.append(user_dict)
  245. else:
  246. pass
  247. return user_list
  248. except Exception as e:
  249. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  250. @classmethod
  251. def get_continuation(cls, data):
  252. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  253. return continuation
  254. @classmethod
  255. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  256. """
  257. 获取个人主页视频列表
  258. :param log_type: 日志
  259. :param crawler: 哪款爬虫
  260. :param browse_id: 每个用户主页的请求参数中唯一值
  261. :param out_uid: 站外用户UID
  262. :return: video_list
  263. """
  264. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  265. payload = json.dumps({
  266. "context": {
  267. "client": {
  268. "hl": "zh-CN",
  269. "gl": "US",
  270. "remoteHost": "38.93.247.21",
  271. "deviceMake": "Apple",
  272. "deviceModel": "",
  273. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  274. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  275. "clientName": "WEB",
  276. "clientVersion": "2.20230201.01.00",
  277. "osName": "Macintosh",
  278. "osVersion": "10_15_7",
  279. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  280. "platform": "DESKTOP",
  281. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  282. "configInfo": {
  283. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  284. },
  285. "timeZone": "Asia/Shanghai",
  286. "browserName": "Chrome",
  287. "browserVersion": "109.0.0.0",
  288. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  289. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  290. "screenWidthPoints": 944,
  291. "screenHeightPoints": 969,
  292. "screenPixelDensity": 1,
  293. "screenDensityFloat": 1,
  294. "utcOffsetMinutes": 480,
  295. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  296. "memoryTotalKbytes": "8000000",
  297. "mainAppWebInfo": {
  298. "graftUrl": f"/{out_uid}/videos",
  299. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  300. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  301. "isWebNativeShareAvailable": True
  302. }
  303. },
  304. "user": {
  305. "lockedSafetyMode": False
  306. },
  307. "request": {
  308. "useSsl": True,
  309. "internalExperimentFlags": [],
  310. "consistencyTokenJars": []
  311. },
  312. "clickTracking": {
  313. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  314. },
  315. "adSignalsInfo": {
  316. "params": [
  317. {
  318. "key": "dt",
  319. "value": "1675676731048"
  320. },
  321. {
  322. "key": "flash",
  323. "value": "0"
  324. },
  325. {
  326. "key": "frm",
  327. "value": "0"
  328. },
  329. {
  330. "key": "u_tz",
  331. "value": "480"
  332. },
  333. {
  334. "key": "u_his",
  335. "value": "4"
  336. },
  337. {
  338. "key": "u_h",
  339. "value": "1080"
  340. },
  341. {
  342. "key": "u_w",
  343. "value": "1920"
  344. },
  345. {
  346. "key": "u_ah",
  347. "value": "1080"
  348. },
  349. {
  350. "key": "u_aw",
  351. "value": "1920"
  352. },
  353. {
  354. "key": "u_cd",
  355. "value": "24"
  356. },
  357. {
  358. "key": "bc",
  359. "value": "31"
  360. },
  361. {
  362. "key": "bih",
  363. "value": "969"
  364. },
  365. {
  366. "key": "biw",
  367. "value": "944"
  368. },
  369. {
  370. "key": "brdim",
  371. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  372. },
  373. {
  374. "key": "vis",
  375. "value": "1"
  376. },
  377. {
  378. "key": "wgl",
  379. "value": "true"
  380. },
  381. {
  382. "key": "ca_type",
  383. "value": "image"
  384. }
  385. ],
  386. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  387. }
  388. },
  389. # "browseId": browse_id,
  390. "params": "EgZ2aWRlb3PyBgQKAjoA",
  391. "continuation": cls.continuation
  392. })
  393. headers = {
  394. 'authority': 'www.youtube.com',
  395. 'accept': '*/*',
  396. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  397. 'cache-control': 'no-cache',
  398. 'content-type': 'application/json',
  399. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  400. 'origin': 'https://www.youtube.com',
  401. 'pragma': 'no-cache',
  402. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  403. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  404. 'sec-ch-ua-arch': '"arm"',
  405. 'sec-ch-ua-bitness': '"64"',
  406. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  407. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  408. 'sec-ch-ua-mobile': '?0',
  409. 'sec-ch-ua-model': '',
  410. 'sec-ch-ua-platform': '"macOS"',
  411. 'sec-ch-ua-platform-version': '"12.4.0"',
  412. 'sec-ch-ua-wow64': '?0',
  413. 'sec-fetch-dest': 'empty',
  414. 'sec-fetch-mode': 'same-origin',
  415. 'sec-fetch-site': 'same-origin',
  416. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  417. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  418. 'x-youtube-bootstrap-logged-in': 'false',
  419. 'x-youtube-client-name': '1',
  420. 'x-youtube-client-version': '2.20230201.01.00'
  421. }
  422. try:
  423. response = requests.post(url=url, headers=headers, data=payload)
  424. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  425. cls.continuation = response.json()['trackingParams']
  426. if response.status_code != 200:
  427. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  428. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  429. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  430. elif 'continuationContents' in response.json():
  431. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  432. if 'richGridContinuation' not in response.json()['continuationContents']:
  433. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  434. Common.logger(log_type, crawler).warning(
  435. f'get_feeds_response:{response.json()["continuationContents"]}\n')
  436. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  437. Common.logger(log_type, crawler).warning(
  438. f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  439. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  440. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  441. return feeds
  442. elif 'onResponseReceivedActions' in response.json():
  443. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  444. if len(response.json()['onResponseReceivedActions']) == 0:
  445. Common.logger(log_type, crawler).warning(
  446. f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  447. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  448. Common.logger(log_type, crawler).warning(
  449. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  450. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0][
  451. 'appendContinuationItemsAction']:
  452. Common.logger(log_type, crawler).warning(
  453. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  454. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction'][
  455. 'continuationItems']) == 0:
  456. Common.logger(log_type, crawler).warning(
  457. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  458. else:
  459. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"][
  460. "continuationItems"]
  461. return feeds
  462. else:
  463. Common.logger(log_type, crawler).info('feeds is None\n')
  464. except Exception as e:
  465. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  466. @classmethod
  467. def get_first_page(cls, user_url):
  468. try:
  469. res = requests.get(url=user_url, headers=cls.headers)
  470. info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
  471. ytInitialData = json.loads(info)
  472. video_list = \
  473. ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content'][
  474. 'richGridRenderer']['contents']
  475. except Exception as e:
  476. video_list = []
  477. return video_list
  478. @classmethod
  479. def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  480. machine, out_user_url, continuation):
  481. post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  482. payload = json.dumps({
  483. "context": {
  484. "client": {
  485. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
  486. "clientName": "WEB",
  487. "clientVersion": "2.20230221.06.00",
  488. "osName": "Macintosh",
  489. "osVersion": "10_15_7",
  490. "originalUrl": "https://www.youtube.com/@wongkim728/videos",
  491. "screenPixelDensity": 2,
  492. "platform": "DESKTOP",
  493. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  494. "configInfo": {
  495. "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
  496. },
  497. "screenDensityFloat": 2,
  498. "timeZone": "Asia/Shanghai",
  499. "browserName": "Chrome",
  500. "browserVersion": "110.0.0.0",
  501. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  502. "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
  503. "screenWidthPoints": 576,
  504. "screenHeightPoints": 764,
  505. "utcOffsetMinutes": 480,
  506. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  507. "connectionType": "CONN_CELLULAR_4G",
  508. "memoryTotalKbytes": "8000000",
  509. "mainAppWebInfo": {
  510. "graftUrl": out_user_url,
  511. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  512. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  513. "isWebNativeShareAvailable": False
  514. }
  515. },
  516. "user": {
  517. "lockedSafetyMode": False
  518. },
  519. "request": {
  520. "useSsl": True,
  521. "internalExperimentFlags": [],
  522. "consistencyTokenJars": []
  523. },
  524. "clickTracking": {
  525. "clickTrackingParams": ""
  526. },
  527. "adSignalsInfo": {
  528. "params": [],
  529. "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
  530. }
  531. },
  532. "continuation": continuation
  533. })
  534. headers = {
  535. # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
  536. 'content-type': 'application/json',
  537. 'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
  538. 'origin': 'https://www.youtube.com',
  539. 'referer': out_user_url,
  540. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
  541. }
  542. try:
  543. res = requests.request("POST", post_url, headers=headers, data=payload).json()
  544. video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  545. for data in video_infos:
  546. if 'richItemRenderer' in data:
  547. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  548. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  549. # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
  550. # 发布时间<=7天
  551. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  552. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  553. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  554. machine)
  555. else:
  556. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  557. return
  558. else:
  559. continuation = cls.get_continuation(data)
  560. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  561. machine, out_user_url, continuation)
  562. except:
  563. return
  564. @classmethod
  565. def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  566. machine, out_user_url):
  567. try:
  568. feeds = cls.get_first_page(out_user_url)
  569. for data in feeds:
  570. if 'richItemRenderer' in data:
  571. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  572. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
  573. # 发布时间<=7天
  574. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  575. if int(time.time()) - publish_time <= 3600 * 24 * 7:
  576. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint,
  577. machine)
  578. else:
  579. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  580. return
  581. else:
  582. continuation = cls.get_continuation(data)
  583. cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid,
  584. machine, out_user_url, continuation=continuation)
  585. except Exception as e:
  586. Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  587. @classmethod
  588. def filter_emoji(cls, title):
  589. # 过滤表情
  590. try:
  591. co = re.compile(u'[\U00010000-\U0010ffff]')
  592. except re.error:
  593. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  594. return co.sub("", title)
  595. @classmethod
  596. def is_contain_chinese(cls, strword):
  597. for ch in strword:
  598. if u'\u4e00' <= ch <= u'\u9fff':
  599. return True
  600. return False
  601. @classmethod
  602. def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id, machine):
  603. try:
  604. if 'streamingData' not in video_dict:
  605. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  606. elif 'videoDetails' not in video_dict:
  607. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  608. elif 'microformat' not in video_dict:
  609. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  610. else:
  611. playerMicroformatRenderer = video_dict['microformat']['playerMicroformatRenderer']
  612. videoDetails = video_dict['videoDetails']
  613. # streamingData = response.json()['streamingData']
  614. # video_title
  615. if 'title' not in videoDetails:
  616. video_title = ''
  617. else:
  618. video_title = videoDetails['title']
  619. video_title = cls.filter_emoji(video_title)
  620. # if Translate.is_contains_chinese(video_title) is False:
  621. if not cls.is_contain_chinese(video_title):
  622. video_title = Translate.google_translate(video_title, machine) \
  623. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  624. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  625. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  626. if 'lengthSeconds' not in videoDetails:
  627. duration = 0
  628. else:
  629. duration = int(videoDetails['lengthSeconds'])
  630. # play_cnt
  631. if 'viewCount' not in videoDetails:
  632. play_cnt = 0
  633. else:
  634. play_cnt = int(videoDetails['viewCount'])
  635. # publish_time
  636. if 'publishDate' not in playerMicroformatRenderer:
  637. publish_time = ''
  638. else:
  639. publish_time = playerMicroformatRenderer['publishDate']
  640. if publish_time == '':
  641. publish_time_stamp = 0
  642. elif ':' in publish_time:
  643. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  644. else:
  645. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  646. # user_name
  647. if 'author' not in videoDetails:
  648. user_name = ''
  649. else:
  650. user_name = videoDetails['author']
  651. # cover_url
  652. if 'thumbnail' not in videoDetails:
  653. cover_url = ''
  654. elif 'thumbnails' not in videoDetails['thumbnail']:
  655. cover_url = ''
  656. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  657. cover_url = ''
  658. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  659. cover_url = ''
  660. else:
  661. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  662. # video_url
  663. # if 'formats' not in streamingData:
  664. # video_url = ''
  665. # elif len(streamingData['formats']) == 0:
  666. # video_url = ''
  667. # elif 'url' not in streamingData['formats'][-1]:
  668. # video_url = ''
  669. # else:
  670. # video_url = streamingData['formats'][-1]['url']
  671. video_url = f"https://www.youtube.com/watch?v={video_id}"
  672. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  673. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  674. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  675. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  676. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  677. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  678. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  679. video_dict = {
  680. 'video_title': video_title,
  681. 'video_id': video_id,
  682. 'duration': duration,
  683. 'play_cnt': play_cnt,
  684. 'publish_time': publish_time,
  685. 'publish_time_stamp': publish_time_stamp,
  686. 'user_name': user_name,
  687. 'out_uid': out_uid,
  688. 'cover_url': cover_url,
  689. 'video_url': video_url,
  690. }
  691. return video_dict
  692. except Exception as e:
  693. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  694. @classmethod
  695. def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
  696. try:
  697. url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  698. payload = json.dumps({
  699. "context": {
  700. "client": {
  701. "hl": "zh-CN",
  702. "gl": "US",
  703. "remoteHost": "38.93.247.21",
  704. "deviceMake": "Apple",
  705. "deviceModel": "",
  706. "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
  707. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  708. "clientName": "WEB",
  709. "clientVersion": "2.20230201.01.00",
  710. "osName": "Macintosh",
  711. "osVersion": "10_15_7",
  712. "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
  713. "platform": "DESKTOP",
  714. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  715. "configInfo": {
  716. "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
  717. },
  718. "timeZone": "Asia/Shanghai",
  719. "browserName": "Chrome",
  720. "browserVersion": "109.0.0.0",
  721. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  722. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
  723. "screenWidthPoints": 1037,
  724. "screenHeightPoints": 969,
  725. "screenPixelDensity": 1,
  726. "screenDensityFloat": 1,
  727. "utcOffsetMinutes": 480,
  728. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  729. "memoryTotalKbytes": "8000000",
  730. "clientScreen": "WATCH",
  731. "mainAppWebInfo": {
  732. "graftUrl": f"/watch?v={video_id}",
  733. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  734. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  735. "isWebNativeShareAvailable": True
  736. }
  737. },
  738. "user": {
  739. "lockedSafetyMode": False
  740. },
  741. "request": {
  742. "useSsl": True,
  743. "internalExperimentFlags": [],
  744. "consistencyTokenJars": []
  745. },
  746. "clickTracking": {
  747. "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
  748. },
  749. "adSignalsInfo": {
  750. "params": [
  751. {
  752. "key": "dt",
  753. "value": "1675749222611"
  754. },
  755. {
  756. "key": "flash",
  757. "value": "0"
  758. },
  759. {
  760. "key": "frm",
  761. "value": "0"
  762. },
  763. {
  764. "key": "u_tz",
  765. "value": "480"
  766. },
  767. {
  768. "key": "u_his",
  769. "value": "3"
  770. },
  771. {
  772. "key": "u_h",
  773. "value": "1080"
  774. },
  775. {
  776. "key": "u_w",
  777. "value": "1920"
  778. },
  779. {
  780. "key": "u_ah",
  781. "value": "1080"
  782. },
  783. {
  784. "key": "u_aw",
  785. "value": "1920"
  786. },
  787. {
  788. "key": "u_cd",
  789. "value": "24"
  790. },
  791. {
  792. "key": "bc",
  793. "value": "31"
  794. },
  795. {
  796. "key": "bih",
  797. "value": "969"
  798. },
  799. {
  800. "key": "biw",
  801. "value": "1037"
  802. },
  803. {
  804. "key": "brdim",
  805. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"
  806. },
  807. {
  808. "key": "vis",
  809. "value": "1"
  810. },
  811. {
  812. "key": "wgl",
  813. "value": "true"
  814. },
  815. {
  816. "key": "ca_type",
  817. "value": "image"
  818. }
  819. ],
  820. "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
  821. }
  822. },
  823. "videoId": str(video_id),
  824. "playbackContext": {
  825. "contentPlaybackContext": {
  826. "currentUrl": f"/watch?v={video_id}",
  827. "vis": 0,
  828. "splay": False,
  829. "autoCaptionsDefaultOn": False,
  830. "autonavState": "STATE_NONE",
  831. "html5Preference": "HTML5_PREF_WANTS",
  832. "signatureTimestamp": 19394,
  833. "referer": f"https://www.youtube.com/watch?v={video_id}",
  834. "lactMilliseconds": "-1",
  835. "watchAmbientModeContext": {
  836. "watchAmbientModeEnabled": True
  837. }
  838. }
  839. },
  840. "racyCheckOk": False,
  841. "contentCheckOk": False
  842. })
  843. headers = {
  844. 'authority': 'www.youtube.com',
  845. 'accept': '*/*',
  846. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  847. 'cache-control': 'no-cache',
  848. 'content-type': 'application/json',
  849. 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
  850. 'origin': 'https://www.youtube.com',
  851. 'pragma': 'no-cache',
  852. 'referer': f'https://www.youtube.com/watch?v={video_id}',
  853. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  854. 'sec-ch-ua-arch': '"arm"',
  855. 'sec-ch-ua-bitness': '"64"',
  856. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  857. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  858. 'sec-ch-ua-mobile': '?0',
  859. 'sec-ch-ua-model': '',
  860. 'sec-ch-ua-platform': '"macOS"',
  861. 'sec-ch-ua-platform-version': '"12.4.0"',
  862. 'sec-ch-ua-wow64': '?0',
  863. 'sec-fetch-dest': 'empty',
  864. 'sec-fetch-mode': 'same-origin',
  865. 'sec-fetch-site': 'same-origin',
  866. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  867. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
  868. 'x-youtube-bootstrap-logged-in': 'false',
  869. 'x-youtube-client-name': '1',
  870. 'x-youtube-client-version': '2.20230201.01.00'
  871. }
  872. response = requests.post(url=url, headers=headers, data=payload)
  873. if response.status_code != 200:
  874. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
  875. elif 'streamingData' not in response.json():
  876. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  877. elif 'videoDetails' not in response.json():
  878. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  879. elif 'microformat' not in response.json():
  880. Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
  881. else:
  882. playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer']
  883. videoDetails = response.json()['videoDetails']
  884. # streamingData = response.json()['streamingData']
  885. # video_title
  886. if 'title' not in videoDetails:
  887. video_title = ''
  888. else:
  889. video_title = videoDetails['title']
  890. video_title = cls.filter_emoji(video_title)
  891. if not cls.is_contain_chinese(video_title):
  892. video_title = Translate.google_translate(video_title, machine) \
  893. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  894. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  895. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  896. if 'lengthSeconds' not in videoDetails:
  897. duration = 0
  898. else:
  899. duration = int(videoDetails['lengthSeconds'])
  900. # play_cnt
  901. if 'viewCount' not in videoDetails:
  902. play_cnt = 0
  903. else:
  904. play_cnt = int(videoDetails['viewCount'])
  905. # publish_time
  906. if 'publishDate' not in playerMicroformatRenderer:
  907. publish_time = ''
  908. else:
  909. publish_time = playerMicroformatRenderer['publishDate']
  910. if publish_time == '':
  911. publish_time_stamp = 0
  912. elif ':' in publish_time:
  913. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  914. else:
  915. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  916. # user_name
  917. if 'author' not in videoDetails:
  918. user_name = ''
  919. else:
  920. user_name = videoDetails['author']
  921. # cover_url
  922. if 'thumbnail' not in videoDetails:
  923. cover_url = ''
  924. elif 'thumbnails' not in videoDetails['thumbnail']:
  925. cover_url = ''
  926. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  927. cover_url = ''
  928. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  929. cover_url = ''
  930. else:
  931. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  932. # video_url
  933. # if 'formats' not in streamingData:
  934. # video_url = ''
  935. # elif len(streamingData['formats']) == 0:
  936. # video_url = ''
  937. # elif 'url' not in streamingData['formats'][-1]:
  938. # video_url = ''
  939. # else:
  940. # video_url = streamingData['formats'][-1]['url']
  941. video_url = f"https://www.youtube.com/watch?v={video_id}"
  942. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  943. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  944. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  945. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  946. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  947. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  948. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  949. video_dict = {
  950. 'video_title': video_title,
  951. 'video_id': video_id,
  952. 'duration': duration,
  953. 'play_cnt': play_cnt,
  954. 'publish_time': publish_time,
  955. 'publish_time_stamp': publish_time_stamp,
  956. 'user_name': user_name,
  957. 'out_uid': out_uid,
  958. 'cover_url': cover_url,
  959. 'video_url': video_url,
  960. }
  961. return video_dict
  962. except Exception as e:
  963. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  964. @classmethod
  965. def repeat_video(cls, log_type, crawler, video_id, env, machine):
  966. sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
  967. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  968. return len(repeat_video)
  969. @classmethod
  970. def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
  971. try:
  972. # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
  973. # repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  974. if video_dict['video_title'] == '' or video_dict['video_url'] == '':
  975. Common.logger(log_type, crawler).info('无效视频\n')
  976. elif video_dict['duration'] > 1200 or video_dict['duration'] < 60:
  977. Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
  978. # elif repeat_video is not None and len(repeat_video) != 0:
  979. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
  980. Common.logger(log_type, crawler).info('视频已下载\n')
  981. elif video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]:
  982. Common.logger(log_type, crawler).info('视频已下载\n')
  983. else:
  984. # 下载视频
  985. Common.logger(log_type, crawler).info('开始下载视频...')
  986. # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
  987. Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
  988. video_dict['video_url'])
  989. # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
  990. # video_width = int(ffmpeg_dict['width'])
  991. # video_height = int(ffmpeg_dict['height'])
  992. # video_size = int(ffmpeg_dict['size'])
  993. video_width = 1280
  994. video_height = 720
  995. duration = int(video_dict['duration'])
  996. Common.logger(log_type, crawler).info(f'video_width:{video_width}')
  997. Common.logger(log_type, crawler).info(f'video_height:{video_height}')
  998. Common.logger(log_type, crawler).info(f'duration:{duration}')
  999. # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
  1000. video_dict['video_width'] = video_width
  1001. video_dict['video_height'] = video_height
  1002. video_dict['duration'] = duration
  1003. video_dict['comment_cnt'] = 0
  1004. video_dict['like_cnt'] = 0
  1005. video_dict['share_cnt'] = 0
  1006. video_dict['avatar_url'] = video_dict['cover_url']
  1007. video_dict['session'] = f'youtube{int(time.time())}'
  1008. rule = '1,2'
  1009. # if duration < 60 or duration > 600:
  1010. # # 删除视频文件夹
  1011. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1012. # Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
  1013. # return
  1014. # if duration == 0 or duration is None:
  1015. # # 删除视频文件夹
  1016. # shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1017. # Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
  1018. # return
  1019. # else:
  1020. # 下载封面
  1021. Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
  1022. # 保存视频文本信息
  1023. Common.save_video_info(log_type, crawler, video_dict)
  1024. # 上传视频
  1025. Common.logger(log_type, crawler).info(f"开始上传视频")
  1026. if env == 'dev':
  1027. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  1028. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  1029. else:
  1030. our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
  1031. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  1032. Common.logger(log_type, crawler).info("视频上传完成")
  1033. if our_video_id is None:
  1034. # 删除视频文件夹
  1035. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
  1036. return
  1037. # 视频信息保存至飞书
  1038. Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
  1039. # 视频ID工作表,首行写入数据
  1040. upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
  1041. values = [[upload_time,
  1042. "定向榜",
  1043. video_dict['video_id'],
  1044. video_dict['video_title'],
  1045. our_video_link,
  1046. video_dict['play_cnt'],
  1047. video_dict['duration'],
  1048. f'{video_width}*{video_height}',
  1049. video_dict['publish_time'],
  1050. video_dict['user_name'],
  1051. video_dict['cover_url'],
  1052. video_dict['video_url']
  1053. ]]
  1054. # time.sleep(1)
  1055. Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
  1056. Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
  1057. # 视频信息保存数据库
  1058. sql = f""" insert into crawler_video(video_id,
  1059. user_id,
  1060. out_user_id,
  1061. platform,
  1062. strategy,
  1063. out_video_id,
  1064. video_title,
  1065. cover_url,
  1066. video_url,
  1067. duration,
  1068. publish_time,
  1069. play_cnt,
  1070. crawler_rule,
  1071. width,
  1072. height)
  1073. values({our_video_id},
  1074. "{our_uid}",
  1075. "{video_dict['out_uid']}",
  1076. "{cls.platform}",
  1077. "定向爬虫策略",
  1078. "{video_dict['video_id']}",
  1079. "{video_dict['video_title']}",
  1080. "{video_dict['cover_url']}",
  1081. "{video_dict['video_url']}",
  1082. {int(duration)},
  1083. "{video_dict['publish_time']}",
  1084. {int(video_dict['play_cnt'])},
  1085. "{rule}",
  1086. {int(video_width)},
  1087. {int(video_height)}) """
  1088. MysqlHelper.update_values(log_type, crawler, sql, env, machine)
  1089. Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  1090. except Exception as e:
  1091. Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n")
  1092. @classmethod
  1093. def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
  1094. try:
  1095. # user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
  1096. user_list = get_user_from_mysql(log_type, crawler, crawler, env, action='get_author_map')
  1097. if len(user_list) == 0:
  1098. Common.logger(log_type, crawler).warning('用户列表为空\n')
  1099. else:
  1100. for user_dict in user_list:
  1101. out_user_url = user_dict['spider_link']
  1102. out_uid = out_user_url.split('/')[3]
  1103. user_name = user_dict['nick_name']
  1104. our_uid = user_dict['media_id']
  1105. Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
  1106. cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, machine,
  1107. out_user_url)
  1108. # Common.logger(log_type, crawler).info('休眠 10 秒')
  1109. # time.sleep(random.randint(1, 2))
  1110. cls.continuation = ''
  1111. except Exception as e:
  1112. Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  1113. if __name__ == "__main__":
  1114. # print(YoutubeFollow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
  1115. # print(YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
  1116. print(YoutubeFollow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'prod', 'prod'))
  1117. # YoutubeFollow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
  1118. # YoutubeFollow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
  1119. # YoutubeFollow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'hk', 'dev', 'local')
  1120. # print(YoutubeFollow.filter_emoji("姐妹倆一唱一和,完美配合,終於把大慶降服了😅😅#萌娃搞笑日常"))
  1121. # YoutubeFollow.repeat_video('follow', 'youtube', 4, "dev", "local")
  1122. # title = "'西部巡游220丨两人一车环游中国半年,需要花费多少钱? 2万公里吃住行费用总结'"
  1123. # title = "'Insanely Crowded Shanghai Yu Garden Lantern Festival Walk Tour 2023 人气爆棚的上海豫园元宵节漫步之行 4K'"
  1124. # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))