youtube_follow_scheduling.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104
  1. # -*- coding: utf-8 -*-
  2. # @Author: lierqiang
  3. # @Time: 2023/3/15
  4. import os
  5. import re
  6. import shutil
  7. import sys
  8. import time
  9. import json
  10. import requests
  11. sys.path.append(os.getcwd())
  12. from common.common import Common
  13. from common.scheduling_db import MysqlHelper
  14. from common.feishu import Feishu
  15. from common.users import Users
  16. from common.publish import Publish
  17. from common.translate import Translate
  18. from common.userAgent import random_user_agent, get_random_user_agent
  19. def format_nums(data):
  20. data_dict = [{'亿': 100000000}, {'百万': 1000000}, {'万': 10000}, {'k': 1000}, {'w': 10000}, {'m': 1000000},
  21. {'千': 1000}, {'M': 1000000}, {'K': 1000}, {'W': 10000}]
  22. data = str(data)
  23. for i in data_dict:
  24. index = data.find(list(i.keys())[0])
  25. if index > 0:
  26. count = int(float(data[:index]) * list(i.values())[0])
  27. return count
  28. elif index < 0:
  29. continue
  30. count = int(float(re.findall(r'\d+', data)[0]))
  31. return count
  32. class Follow:
    # Pagination parameter: continuation value sent with the next feed request
  34. continuation = ''
    # Name of the platform being crawled
  36. platform = 'youtube'
  37. @classmethod
  38. def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
  39. """
  40. 获取站外用户信息
  41. :param log_type: 日志
  42. :param crawler: 哪款爬虫
  43. :param browse_id: browse_id
  44. :param out_user_id: 站外用户 UID
  45. :return: out_user_dict = {'out_user_name': 站外用户昵称,
  46. 'out_avatar_url': 站外用户头像,
  47. 'out_fans': 站外用户粉丝量,
  48. 'out_play_cnt': 站外用户总播放量,
  49. 'out_create_time': 站外用户创建时间}
  50. """
  51. try:
  52. url = f'https://www.youtube.com/{out_user_id}/about'
  53. res = requests.get(url=url, headers=random_user_agent('pc'))
  54. info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
  55. data = json.loads(info)
  56. header = data['header']['c4TabbedHeaderRenderer']
  57. tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
  58. subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
  59. for tab in tabs:
  60. if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
  61. continue
  62. viewCountText = \
  63. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  64. 'contents'][0]['channelAboutFullMetadataRenderer']['viewCountText']['simpleText']
  65. out_create_time = \
  66. tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer'][
  67. 'contents'][0]['channelAboutFullMetadataRenderer']['joinedDateText']['runs'][1]['text']
  68. break
  69. out_user_dict = {
  70. 'out_user_name': header['title'],
  71. 'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
  72. 'out_fans': format_nums(subsimpleText),
  73. 'out_play_cnt': int(
  74. viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
  75. 'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
  76. }
  77. # print(out_user_dict)
  78. return out_user_dict
  79. except Exception as e:
  80. Common.logger(log_type, crawler).error(f'get_out_user_info异常:{e}\n')
  81. @classmethod
  82. def get_user_from_feishu(cls, log_type, crawler, sheetid, env):
  83. """
  84. 补全飞书用户表信息,并返回
  85. :param log_type: 日志
  86. :param crawler: 哪款爬虫
  87. :param sheetid: 飞书表
  88. :param env: 正式环境:prod,测试环境:dev
  89. :param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
  90. :return: user_list
  91. """
  92. try:
  93. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  94. user_list = []
  95. for i in range(1, len(user_sheet)):
  96. out_uid = user_sheet[i][2]
  97. user_name = user_sheet[i][3]
  98. browse_id = user_sheet[i][5]
  99. our_uid = user_sheet[i][6]
  100. uer_url = user_sheet[i][4]
  101. if out_uid is not None and user_name is not None:
  102. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  103. if our_uid is None:
  104. sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
  105. our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env)
  106. # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
  107. if our_user_info is None or len(our_user_info) == 0:
  108. # 获取站外账号信息,写入数据库
  109. out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
  110. out_avatar_url = out_user_dict['out_avatar_url']
  111. out_create_time = out_user_dict['out_create_time']
  112. out_play_cnt = out_user_dict['out_play_cnt']
  113. out_fans = out_user_dict['out_fans']
  114. tag = 'youtube爬虫,定向爬虫策略'
  115. # 创建站内账号
  116. create_user_dict = {
  117. 'nickName': user_name,
  118. 'avatarUrl': out_avatar_url,
  119. 'tagName': tag,
  120. }
  121. our_uid = Users.create_uid(log_type, crawler, create_user_dict, env)
  122. Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
  123. if env == 'prod':
  124. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  125. else:
  126. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  127. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  128. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  129. [[our_uid, our_user_link]])
  130. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
  131. sql = f""" insert into crawler_user(user_id,
  132. out_user_id,
  133. out_user_name,
  134. out_avatar_url,
  135. out_create_time,
  136. out_play_cnt,
  137. out_fans,
  138. platform,
  139. tag)
  140. values({our_uid},
  141. "{out_uid}",
  142. "{user_name}",
  143. "{out_avatar_url}",
  144. "{out_create_time}",
  145. {out_play_cnt},
  146. {out_fans},
  147. "{cls.platform}",
  148. "{tag}") """
  149. Common.logger(log_type, crawler).info(f'sql:{sql}')
  150. MysqlHelper.update_values(log_type, crawler, sql, env)
  151. Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
  152. # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
  153. else:
  154. our_uid = our_user_info[0][1]
  155. if 'env' == 'prod':
  156. our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
  157. else:
  158. our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
  159. Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
  160. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  161. [[our_uid, our_user_link]])
  162. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  163. user_dict = {
  164. 'out_user_id': out_uid,
  165. 'out_user_name': user_name,
  166. 'out_browse_id': browse_id,
  167. 'our_user_id': our_uid,
  168. 'out_user_url': uer_url
  169. }
  170. user_list.append(user_dict)
  171. else:
  172. pass
  173. return user_list
  174. except Exception as e:
  175. Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
  176. @classmethod
  177. def get_continuation(cls, data):
  178. continuation = data['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
  179. return continuation
  180. @classmethod
  181. def get_feeds(cls, log_type, crawler, browse_id, out_uid):
  182. """
  183. 获取个人主页视频列表
  184. :param log_type: 日志
  185. :param crawler: 哪款爬虫
  186. :param browse_id: 每个用户主页的请求参数中唯一值
  187. :param out_uid: 站外用户UID
  188. :return: video_list
  189. """
  190. url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  191. payload = json.dumps({
  192. "context": {
  193. "client": {
  194. "hl": "zh-CN",
  195. "gl": "US",
  196. "remoteHost": "38.93.247.21",
  197. "deviceMake": "Apple",
  198. "deviceModel": "",
  199. "visitorData": "CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D",
  200. "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
  201. "clientName": "WEB",
  202. "clientVersion": "2.20230201.01.00",
  203. "osName": "Macintosh",
  204. "osVersion": "10_15_7",
  205. "originalUrl": f"https://www.youtube.com/{out_uid}/videos",
  206. "platform": "DESKTOP",
  207. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  208. "configInfo": {
  209. "appInstallData": "CLqYg58GEInorgUQuIuuBRCU-K4FENfkrgUQuNSuBRC2nP4SEPuj_hIQ5_euBRCy9a4FEKLsrgUQt-CuBRDi1K4FEILdrgUQh92uBRDM364FEP7urgUQzPWuBRDZ6a4FEOSg_hIQo_muBRDvo_4SEMnJrgUQlqf-EhCR-PwS"
  210. },
  211. "timeZone": "Asia/Shanghai",
  212. "browserName": "Chrome",
  213. "browserVersion": "109.0.0.0",
  214. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  215. "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09ELqYg58GGOmU7Z4G",
  216. "screenWidthPoints": 944,
  217. "screenHeightPoints": 969,
  218. "screenPixelDensity": 1,
  219. "screenDensityFloat": 1,
  220. "utcOffsetMinutes": 480,
  221. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  222. "memoryTotalKbytes": "8000000",
  223. "mainAppWebInfo": {
  224. "graftUrl": f"/{out_uid}/videos",
  225. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  226. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  227. "isWebNativeShareAvailable": True
  228. }
  229. },
  230. "user": {
  231. "lockedSafetyMode": False
  232. },
  233. "request": {
  234. "useSsl": True,
  235. "internalExperimentFlags": [],
  236. "consistencyTokenJars": []
  237. },
  238. "clickTracking": {
  239. "clickTrackingParams": "CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks="
  240. },
  241. "adSignalsInfo": {
  242. "params": [
  243. {
  244. "key": "dt",
  245. "value": "1675676731048"
  246. },
  247. {
  248. "key": "flash",
  249. "value": "0"
  250. },
  251. {
  252. "key": "frm",
  253. "value": "0"
  254. },
  255. {
  256. "key": "u_tz",
  257. "value": "480"
  258. },
  259. {
  260. "key": "u_his",
  261. "value": "4"
  262. },
  263. {
  264. "key": "u_h",
  265. "value": "1080"
  266. },
  267. {
  268. "key": "u_w",
  269. "value": "1920"
  270. },
  271. {
  272. "key": "u_ah",
  273. "value": "1080"
  274. },
  275. {
  276. "key": "u_aw",
  277. "value": "1920"
  278. },
  279. {
  280. "key": "u_cd",
  281. "value": "24"
  282. },
  283. {
  284. "key": "bc",
  285. "value": "31"
  286. },
  287. {
  288. "key": "bih",
  289. "value": "969"
  290. },
  291. {
  292. "key": "biw",
  293. "value": "944"
  294. },
  295. {
  296. "key": "brdim",
  297. "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,944,969"
  298. },
  299. {
  300. "key": "vis",
  301. "value": "1"
  302. },
  303. {
  304. "key": "wgl",
  305. "value": "true"
  306. },
  307. {
  308. "key": "ca_type",
  309. "value": "image"
  310. }
  311. ],
  312. "bid": "ANyPxKpfiaAf-DBzNeKLgkceMEA9UIeCWFRTRm4AQMCuejhI3PGwDB1jizQIX60YcEYtt_CX7tZWAbYerQ-rWLvV7y_KCLkBww"
  313. }
  314. },
  315. # "browseId": browse_id,
  316. "params": "EgZ2aWRlb3PyBgQKAjoA",
  317. "continuation": cls.continuation
  318. })
  319. headers = {
  320. 'authority': 'www.youtube.com',
  321. 'accept': '*/*',
  322. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  323. 'cache-control': 'no-cache',
  324. 'content-type': 'application/json',
  325. 'cookie': 'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-1kg1gfd=itct=CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D&csn=MC4zNzI3MDcwMDA1Mjg4NzE5Ng..&endpoint=%7B%22clickTrackingParams%22%3A%22CBcQ8JMBGAYiEwiNhIXX9IL9AhUFSUwIHWnnDks%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2F%40chinatravel5971%2Fvideos%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_CHANNEL%22%2C%22rootVe%22%3A3611%2C%22apiUrl%22%3A%22%2Fyoutubei%2Fv1%2Fbrowse%22%7D%7D%2C%22browseEndpoint%22%3A%7B%22browseId%22%3A%22UCpLXnfBCNhj8KLnt54RQMKA%22%2C%22params%22%3A%22EgZ2aWRlb3PyBgQKAjoA%22%2C%22canonicalBaseUrl%22%3A%22%2F%40chinatravel5971%22%7D%7D',
  326. 'origin': 'https://www.youtube.com',
  327. 'pragma': 'no-cache',
  328. 'referer': f'https://www.youtube.com/{out_uid}/featured',
  329. 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
  330. 'sec-ch-ua-arch': '"arm"',
  331. 'sec-ch-ua-bitness': '"64"',
  332. 'sec-ch-ua-full-version': '"109.0.1518.52"',
  333. 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
  334. 'sec-ch-ua-mobile': '?0',
  335. 'sec-ch-ua-model': '',
  336. 'sec-ch-ua-platform': '"macOS"',
  337. 'sec-ch-ua-platform-version': '"12.4.0"',
  338. 'sec-ch-ua-wow64': '?0',
  339. 'sec-fetch-dest': 'empty',
  340. 'sec-fetch-mode': 'same-origin',
  341. 'sec-fetch-site': 'same-origin',
  342. 'user-agent': get_random_user_agent('pc'),
  343. 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
  344. 'x-youtube-bootstrap-logged-in': 'false',
  345. 'x-youtube-client-name': '1',
  346. 'x-youtube-client-version': '2.20230201.01.00'
  347. }
  348. try:
  349. response = requests.post(url=url, headers=headers, data=payload)
  350. # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
  351. cls.continuation = response.json()['trackingParams']
  352. if response.status_code != 200:
  353. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  354. elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
  355. Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
  356. elif 'continuationContents' in response.json():
  357. # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
  358. if 'richGridContinuation' not in response.json()['continuationContents']:
  359. # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
  360. Common.logger(log_type, crawler).warning(
  361. f'get_feeds_response:{response.json()["continuationContents"]}\n')
  362. elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
  363. Common.logger(log_type, crawler).warning(
  364. f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
  365. elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
  366. feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
  367. return feeds
  368. elif 'onResponseReceivedActions' in response.json():
  369. Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
  370. if len(response.json()['onResponseReceivedActions']) == 0:
  371. Common.logger(log_type, crawler).warning(
  372. f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
  373. elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
  374. Common.logger(log_type, crawler).warning(
  375. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
  376. elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0][
  377. 'appendContinuationItemsAction']:
  378. Common.logger(log_type, crawler).warning(
  379. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
  380. elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction'][
  381. 'continuationItems']) == 0:
  382. Common.logger(log_type, crawler).warning(
  383. f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
  384. else:
  385. feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"][
  386. "continuationItems"]
  387. return feeds
  388. else:
  389. Common.logger(log_type, crawler).info('feeds is None\n')
  390. except Exception as e:
  391. Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
  392. @classmethod
  393. def get_first_page(cls, user_url):
  394. try:
  395. res = requests.get(url=user_url, headers=random_user_agent('pc'))
  396. info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
  397. ytInitialData = json.loads(info)
  398. video_list = \
  399. ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content'][
  400. 'richGridRenderer']['contents']
  401. except Exception as e:
  402. video_list = []
  403. return video_list
  404. @classmethod
  405. def get_next_page(cls, log_type, crawler, task, strategy, oss_endpoint, env, our_uid,
  406. out_uid, out_user_url, continuation):
  407. min_publish_day = task['min_publish_day']
  408. post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
  409. payload = json.dumps({
  410. "context": {
  411. "client": {
  412. "userAgent": get_random_user_agent('pc'),
  413. "clientName": "WEB",
  414. "clientVersion": "2.20230221.06.00",
  415. "osName": "Macintosh",
  416. "osVersion": "10_15_7",
  417. "originalUrl": "https://www.youtube.com/{}/videos".format(out_uid),
  418. "screenPixelDensity": 2,
  419. "platform": "DESKTOP",
  420. "clientFormFactor": "UNKNOWN_FORM_FACTOR",
  421. "configInfo": {
  422. "appInstallData": "CKWy258GEOWg_hIQzN-uBRC4rP4SEOf3rgUQzPWuBRCi7K4FEMiJrwUQieiuBRDshq8FENrprgUQ4tSuBRD-7q4FEKOArwUQgt2uBRC2nP4SEJT4rgUQuIuuBRCH3a4FELjUrgUQjqj-EhCR-PwS"
  423. },
  424. "screenDensityFloat": 2,
  425. "timeZone": "Asia/Shanghai",
  426. "browserName": "Chrome",
  427. "browserVersion": "110.0.0.0",
  428. "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  429. "deviceExperimentId": "ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EKWy258GGJie0p8G",
  430. "screenWidthPoints": 576,
  431. "screenHeightPoints": 764,
  432. "utcOffsetMinutes": 480,
  433. "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
  434. "connectionType": "CONN_CELLULAR_4G",
  435. "memoryTotalKbytes": "8000000",
  436. "mainAppWebInfo": {
  437. "graftUrl": out_user_url,
  438. "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
  439. "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
  440. "isWebNativeShareAvailable": False
  441. }
  442. },
  443. "user": {
  444. "lockedSafetyMode": False
  445. },
  446. "request": {
  447. "useSsl": True,
  448. "internalExperimentFlags": [],
  449. "consistencyTokenJars": []
  450. },
  451. "clickTracking": {
  452. "clickTrackingParams": ""
  453. },
  454. "adSignalsInfo": {
  455. "params": [],
  456. "bid": "ANyPxKo8EXfKNGm3gYLAqhR5HA90FSKMvQf43tk3KV_XUWB5xi_0OxAo2TJTfoVx_516NRxz0qwRg-1x2kD-IVt7LPKrRHkJBA"
  457. }
  458. },
  459. "continuation": continuation
  460. })
  461. headers = {
  462. # 'authorization': 'SAPISIDHASH 1677121838_f5055bd4b4c242d18af423b37ac0f556bf1dfc30',
  463. 'content-type': 'application/json',
  464. 'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
  465. 'origin': 'https://www.youtube.com',
  466. 'referer': out_user_url,
  467. 'user-agent': get_random_user_agent('pc'),
  468. }
  469. try:
  470. res = requests.request("POST", post_url, headers=headers, data=payload).json()
  471. video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  472. for data in video_infos:
  473. if 'richItemRenderer' in data:
  474. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  475. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
  476. # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
  477. # 发布时间<=7天
  478. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  479. if int(time.time()) - publish_time <= 3600 * 24 * min_publish_day:
  480. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
  481. else:
  482. Common.logger(log_type, crawler).info('发布时间超过7天\n')
  483. return
  484. else:
  485. continuation = cls.get_continuation(data)
  486. cls.get_next_page(log_type, crawler, task, strategy, oss_endpoint, env, our_uid, out_uid,
  487. out_user_url, continuation)
  488. except:
  489. return
  490. @classmethod
  491. def get_videos(cls, log_type, crawler, task, oss_endpoint, env, our_uid, out_uid, out_user_url):
  492. try:
  493. # 修改
  494. strategy = task['user_tag']
  495. min_publish_day = int(task['min_publish_day'])
  496. feeds = cls.get_first_page(out_user_url)
  497. for data in feeds:
  498. if 'richItemRenderer' in data:
  499. video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
  500. video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
  501. # 发布时间判断
  502. publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
  503. if int(time.time()) - publish_time > 3600 * 24 * min_publish_day:
  504. Common.logger(log_type, crawler).info(f'发布时间超过{min_publish_day}天\n')
  505. elif video_dict['video_title'] == '' or video_dict['video_url'] == '':
  506. Common.logger(log_type, crawler).info('无效视频\n')
  507. elif video_dict['duration'] > task['duration_max'] or video_dict['duration'] < task['duration_min']:
  508. Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
  509. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
  510. Common.logger(log_type, crawler).info('视频已下载\n')
  511. else:
  512. cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
  513. else:
  514. continuation = cls.get_continuation(data)
  515. cls.get_next_page(log_type, crawler, task, strategy, oss_endpoint, env, our_uid, out_uid,
  516. out_user_url, continuation=continuation)
  517. except Exception as e:
  518. Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
  519. @classmethod
  520. def filter_emoji(cls, title):
  521. # 过滤表情
  522. try:
  523. co = re.compile(u'[\U00010000-\U0010ffff]')
  524. except re.error:
  525. co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
  526. return co.sub("", title)
  527. @classmethod
  528. def is_contain_chinese(cls, strword):
  529. for ch in strword:
  530. if u'\u4e00' <= ch <= u'\u9fff':
  531. return True
  532. return False
  533. @classmethod
  534. def parse_video(cls, video_dict, log_type, crawler, out_uid, video_id):
  535. try:
  536. if 'streamingData' not in video_dict:
  537. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  538. elif 'videoDetails' not in video_dict:
  539. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  540. elif 'microformat' not in video_dict:
  541. Common.logger(log_type, crawler).warning(f"get_video_info_response:{video_dict}\n")
  542. else:
  543. playerMicroformatRenderer = video_dict['microformat']['playerMicroformatRenderer']
  544. videoDetails = video_dict['videoDetails']
  545. # streamingData = response.json()['streamingData']
  546. # video_title
  547. if 'title' not in videoDetails:
  548. video_title = ''
  549. else:
  550. video_title = videoDetails['title']
  551. video_title = cls.filter_emoji(video_title)
  552. # if Translate.is_contains_chinese(video_title) is False:
  553. if not cls.is_contain_chinese(video_title):
  554. video_title = Translate.google_translate(video_title) \
  555. .strip().replace("\\", "").replace(" ", "").replace("\n", "") \
  556. .replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", "") \
  557. .replace(";", "").replace("amp;", "") # 自动翻译标题为中文
  558. if 'lengthSeconds' not in videoDetails:
  559. duration = 0
  560. else:
  561. duration = int(videoDetails['lengthSeconds'])
  562. # play_cnt
  563. if 'viewCount' not in videoDetails:
  564. play_cnt = 0
  565. else:
  566. play_cnt = int(videoDetails['viewCount'])
  567. # publish_time
  568. if 'publishDate' not in playerMicroformatRenderer:
  569. publish_time = ''
  570. else:
  571. publish_time = playerMicroformatRenderer['publishDate']
  572. if publish_time == '':
  573. publish_time_stamp = 0
  574. elif ':' in publish_time:
  575. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
  576. else:
  577. publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
  578. # user_name
  579. if 'author' not in videoDetails:
  580. user_name = ''
  581. else:
  582. user_name = videoDetails['author']
  583. # cover_url
  584. if 'thumbnail' not in videoDetails:
  585. cover_url = ''
  586. elif 'thumbnails' not in videoDetails['thumbnail']:
  587. cover_url = ''
  588. elif len(videoDetails['thumbnail']['thumbnails']) == 0:
  589. cover_url = ''
  590. elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
  591. cover_url = ''
  592. else:
  593. cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
  594. video_url = f"https://www.youtube.com/watch?v={video_id}"
  595. Common.logger(log_type, crawler).info(f'video_title:{video_title}')
  596. Common.logger(log_type, crawler).info(f'video_id:{video_id}')
  597. Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
  598. Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
  599. Common.logger(log_type, crawler).info(f'user_name:{user_name}')
  600. Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
  601. Common.logger(log_type, crawler).info(f'video_url:{video_url}')
  602. video_dict = {
  603. 'video_title': video_title,
  604. 'video_id': video_id,
  605. 'duration': duration,
  606. 'play_cnt': play_cnt,
  607. 'publish_time': publish_time,
  608. 'publish_time_stamp': publish_time_stamp,
  609. 'user_name': user_name,
  610. 'out_uid': out_uid,
  611. 'cover_url': cover_url,
  612. 'video_url': video_url,
  613. }
  614. return video_dict
  615. except Exception as e:
  616. Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  @classmethod
  def get_video_info(cls, log_type, crawler, out_uid, video_id):
      """Fetch metadata for one YouTube video via the ``youtubei/v1/player`` endpoint.

      :param log_type: first argument forwarded to ``Common.logger``.
      :param crawler: second argument forwarded to ``Common.logger``.
      :param out_uid: copied unchanged into the result under ``'out_uid'``.
      :param video_id: YouTube video id; embedded in the request payload,
          cookie and referer, and used to build ``'video_url'``.
      :return: dict with the keys ``video_title``, ``video_id``, ``duration``,
          ``play_cnt``, ``publish_time``, ``publish_time_stamp``, ``user_name``,
          ``out_uid``, ``cover_url`` and ``video_url``. ``None`` when the HTTP
          status is not 200, when the response lacks ``streamingData``,
          ``videoDetails`` or ``microformat``, or when any exception occurs
          (the exception is logged, not re-raised).
      """
      try:
          url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
          # One user agent per request so the payload and the HTTP header agree.
          user_agent = get_random_user_agent('pc')
          payload = json.dumps({
              "context": {
                  "client": {
                      "hl": "zh-CN",
                      "gl": "US",
                      "remoteHost": "38.93.247.21",
                      "deviceMake": "Apple",
                      "deviceModel": "",
                      "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
                      "userAgent": user_agent,
                      "clientName": "WEB",
                      "clientVersion": "2.20230201.01.00",
                      "osName": "Macintosh",
                      "osVersion": "10_15_7",
                      "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
                      "platform": "DESKTOP",
                      "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                      "configInfo": {
                          "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
                      },
                      "timeZone": "Asia/Shanghai",
                      "browserName": "Chrome",
                      "browserVersion": "109.0.0.0",
                      "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                      "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
                      "screenWidthPoints": 1037,
                      "screenHeightPoints": 969,
                      "screenPixelDensity": 1,
                      "screenDensityFloat": 1,
                      "utcOffsetMinutes": 480,
                      "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
                      "memoryTotalKbytes": "8000000",
                      "clientScreen": "WATCH",
                      "mainAppWebInfo": {
                          "graftUrl": f"/watch?v={video_id}",
                          "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
                          "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
                          "isWebNativeShareAvailable": True
                      }
                  },
                  "user": {
                      "lockedSafetyMode": False
                  },
                  "request": {
                      "useSsl": True,
                      "internalExperimentFlags": [],
                      "consistencyTokenJars": []
                  },
                  "clickTracking": {
                      "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
                  },
                  "adSignalsInfo": {
                      "params": [
                          {"key": "dt", "value": "1675749222611"},
                          {"key": "flash", "value": "0"},
                          {"key": "frm", "value": "0"},
                          {"key": "u_tz", "value": "480"},
                          {"key": "u_his", "value": "3"},
                          {"key": "u_h", "value": "1080"},
                          {"key": "u_w", "value": "1920"},
                          {"key": "u_ah", "value": "1080"},
                          {"key": "u_aw", "value": "1920"},
                          {"key": "u_cd", "value": "24"},
                          {"key": "bc", "value": "31"},
                          {"key": "bih", "value": "969"},
                          {"key": "biw", "value": "1037"},
                          {"key": "brdim", "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"},
                          {"key": "vis", "value": "1"},
                          {"key": "wgl", "value": "true"},
                          {"key": "ca_type", "value": "image"}
                      ],
                      "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
                  }
              },
              "videoId": str(video_id),
              "playbackContext": {
                  "contentPlaybackContext": {
                      "currentUrl": f"/watch?v={video_id}",
                      "vis": 0,
                      "splay": False,
                      "autoCaptionsDefaultOn": False,
                      "autonavState": "STATE_NONE",
                      "html5Preference": "HTML5_PREF_WANTS",
                      "signatureTimestamp": 19394,
                      "referer": f"https://www.youtube.com/watch?v={video_id}",
                      "lactMilliseconds": "-1",
                      "watchAmbientModeContext": {
                          "watchAmbientModeEnabled": True
                      }
                  }
              },
              "racyCheckOk": False,
              "contentCheckOk": False
          })
          headers = {
              'authority': 'www.youtube.com',
              'accept': '*/*',
              'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
              'cache-control': 'no-cache',
              'content-type': 'application/json',
              'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
              'origin': 'https://www.youtube.com',
              'pragma': 'no-cache',
              'referer': f'https://www.youtube.com/watch?v={video_id}',
              'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
              'sec-ch-ua-arch': '"arm"',
              'sec-ch-ua-bitness': '"64"',
              'sec-ch-ua-full-version': '"109.0.1518.52"',
              'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-model': '',
              'sec-ch-ua-platform': '"macOS"',
              'sec-ch-ua-platform-version': '"12.4.0"',
              'sec-ch-ua-wow64': '?0',
              'sec-fetch-dest': 'empty',
              'sec-fetch-mode': 'same-origin',
              'sec-fetch-site': 'same-origin',
              'user-agent': user_agent,
              'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
              'x-youtube-bootstrap-logged-in': 'false',
              'x-youtube-client-name': '1',
              'x-youtube-client-version': '2.20230201.01.00'
          }
          # Without a timeout a stalled connection would block the crawler
          # forever; a timeout error lands in the except clause below.
          response = requests.post(url=url, headers=headers, data=payload, timeout=30)
          if response.status_code != 200:
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
              return None

          # Parse the body once instead of on every membership test and lookup.
          response_json = response.json()
          required_sections = ('streamingData', 'videoDetails', 'microformat')
          if any(section not in response_json for section in required_sections):
              Common.logger(log_type, crawler).warning(f"get_video_info_response:{response_json}\n")
              return None

          player_microformat = response_json['microformat']['playerMicroformatRenderer']
          video_details = response_json['videoDetails']

          # video_title
          video_title = cls.filter_emoji(video_details.get('title', ''))
          if not cls.is_contain_chinese(video_title):
              # Translate the title to Chinese, then drop characters and
              # entity fragments that are unwanted in the stored title.
              video_title = Translate.google_translate(video_title).strip()
              # "amp;" has to be removed before ";", otherwise "&amp;" would
              # leave a stray "amp" behind.
              for fragment in ("\\", " ", "\n", "/", "\r", "&NBSP", "&", "amp;", ";"):
                  video_title = video_title.replace(fragment, "")

          # duration (seconds)
          duration = int(video_details['lengthSeconds']) if 'lengthSeconds' in video_details else 0

          # play_cnt
          play_cnt = int(video_details['viewCount']) if 'viewCount' in video_details else 0

          # publish_time: either "YYYY-mm-dd HH:MM:SS" or "YYYY-mm-dd" is accepted;
          # time.mktime interprets the parsed value as local time.
          publish_time = player_microformat.get('publishDate', '')
          if publish_time == '':
              publish_time_stamp = 0
          elif ':' in publish_time:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
          else:
              publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

          # user_name
          user_name = video_details.get('author', '')

          # cover_url: the last entry of the thumbnail list, or '' when absent.
          cover_url = ''
          if 'thumbnail' in video_details and 'thumbnails' in video_details['thumbnail']:
              thumbnails = video_details['thumbnail']['thumbnails']
              if len(thumbnails) != 0 and 'url' in thumbnails[-1]:
                  cover_url = thumbnails[-1]['url']

          # video_url: the watch-page URL is used, not a stream URL from streamingData.
          video_url = f"https://www.youtube.com/watch?v={video_id}"

          Common.logger(log_type, crawler).info(f'video_title:{video_title}')
          Common.logger(log_type, crawler).info(f'video_id:{video_id}')
          Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
          Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
          Common.logger(log_type, crawler).info(f'user_name:{user_name}')
          Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
          Common.logger(log_type, crawler).info(f'video_url:{video_url}')

          return {
              'video_title': video_title,
              'video_id': video_id,
              'duration': duration,
              'play_cnt': play_cnt,
              'publish_time': publish_time,
              'publish_time_stamp': publish_time_stamp,
              'user_name': user_name,
              'out_uid': out_uid,
              'cover_url': cover_url,
              'video_url': video_url,
          }
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Count the ``crawler_video`` rows stored for this platform and video id.

      :param log_type: forwarded to ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``MysqlHelper.get_values``.
      :param video_id: value matched against the ``out_video_id`` column.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :return: number of rows returned by the query.
      """
      # NOTE(review): the statement is assembled by string interpolation;
      # confirm video_id can never contain a double quote.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      return len(MysqlHelper.get_values(log_type, crawler, query, env))
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint):
      """Download a video and its cover, publish it, and record the result.

      Steps, in order: download the video and cover through
      ``Common.download_method``, save the metadata with
      ``Common.save_video_info``, upload through ``Publish.upload_and_publish``,
      write one row to the Feishu sheet ``GVxlYk`` and insert one row into
      ``crawler_video``.

      :param log_type: forwarded to the logger and helper calls.
      :param crawler: forwarded to the helpers; also the root of the local
          video directory removed when the upload yields no id.
      :param video_dict: metadata dict; this method reads ``video_title``,
          ``video_url``, ``duration``, ``cover_url``, ``video_id``,
          ``play_cnt``, ``publish_time``, ``user_name`` and ``out_uid``, and
          adds size, counter, avatar and session fields in place.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param our_uid: forwarded to the upload and stored as ``user_id``.
      :param env: ``'dev'`` selects the test admin host for the video link;
          any other value selects the production host.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :return: ``None``. Exceptions are logged and not re-raised.
      """
      try:
          # Download the video.
          Common.logger(log_type, crawler).info('开始下载视频...')
          Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
                                 video_dict['video_url'])
          # The resolution is not probed from the file; fixed values are recorded.
          video_width = 1280
          video_height = 720
          duration = int(video_dict['duration'])
          Common.logger(log_type, crawler).info(f'video_width:{video_width}')
          Common.logger(log_type, crawler).info(f'video_height:{video_height}')
          Common.logger(log_type, crawler).info(f'duration:{duration}')

          video_dict['video_width'] = video_width
          video_dict['video_height'] = video_height
          video_dict['duration'] = duration
          video_dict['comment_cnt'] = 0
          video_dict['like_cnt'] = 0
          video_dict['share_cnt'] = 0
          video_dict['avatar_url'] = video_dict['cover_url']
          video_dict['session'] = f'youtube{int(time.time())}'
          rule = '1,2'

          # Download the cover.
          Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
          # Save the video's text metadata.
          Common.save_video_info(log_type, crawler, video_dict)

          # Upload the video; the call is the same for every env.
          Common.logger(log_type, crawler).info("开始上传视频")
          our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
          Common.logger(log_type, crawler).info("视频上传完成")
          if our_video_id is None:
              # No id came back: remove the local video folder and stop.
              shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
              return

          # Only the admin host differs between dev and the other envs.
          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"

          # Record the video in Feishu: insert a row, then fill it in.
          Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
          upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
          values = [[upload_time,
                     "定向榜",
                     video_dict['video_id'],
                     video_dict['video_title'],
                     our_video_link,
                     video_dict['play_cnt'],
                     video_dict['duration'],
                     f'{video_width}*{video_height}',
                     video_dict['publish_time'],
                     video_dict['user_name'],
                     video_dict['cover_url'],
                     video_dict['video_url']
                     ]]
          Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
          Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

          # Record the video in the database.
          # NOTE(review): values are interpolated into the statement; a title
          # containing a double quote would break it. Confirm whether
          # MysqlHelper offers a parameterized variant.
          sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          "{our_uid}",
                          "{video_dict['out_uid']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {duration},
                          "{video_dict['publish_time']}",
                          {int(video_dict['play_cnt'])},
                          "{rule}",
                          {video_width},
                          {video_height}) """
          MysqlHelper.update_values(log_type, crawler, sql, env)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          # Logged at error level, like the other methods' exception handlers.
          Common.logger(log_type, crawler).error(f"download_publish异常:{e}\n")
  @classmethod
  def get_users(cls, log_type, crawler, task, env):
      """Build one user record per link listed in ``task['spider_link']``.

      Each link is looked up in ``crawler_author_map``; when no row matches,
      ``our_uid`` is 0 and the miss is logged.

      :param log_type: forwarded to the logger and ``MysqlHelper.get_values``.
      :param crawler: forwarded to the logger and ``MysqlHelper.get_values``.
      :param task: mapping whose ``'spider_link'`` entry is iterated as links.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :return: list of dicts with the keys ``out_uid``, ``out_user_url`` and
          ``our_uid``.
      """
      user_list = []
      for spider_link in task['spider_link']:
          # The external uid is the fourth "/"-separated segment of the link.
          external_uid = spider_link.split("/")[3]
          sql = f""" select * from crawler_author_map where spider_link="{spider_link}" """
          author_rows = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=sql, env=env)
          if len(author_rows) != 0:
              internal_uid = author_rows[0]["media_id"]
          else:
              internal_uid = 0
              Common.logger(log_type, crawler).info(f"没有站内虚拟账号: {spider_link}\n")
          user_list.append({
              "out_uid": external_uid,
              "out_user_url": spider_link,
              "our_uid": internal_uid
          })
      Common.logger(log_type, crawler).info(f"user_list:{user_list}")
      return user_list
  @classmethod
  def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
      """Run ``get_videos`` for every user resolved from the task.

      The user list comes from ``get_users``. After each user's
      ``get_videos`` call, ``cls.continuation`` is reset to an empty string.
      Exceptions are logged and not re-raised.

      :param log_type: forwarded to the logger and the helper calls.
      :param crawler: forwarded to the logger and the helper calls.
      :param task: forwarded to ``get_users`` and ``get_videos``.
      :param oss_endpoint: forwarded to ``get_videos``.
      :param env: forwarded to ``get_users`` and ``get_videos``.
      :return: ``None``.
      """
      try:
          followed_users = cls.get_users(log_type, crawler, task, env)
          if len(followed_users) == 0:
              Common.logger(log_type, crawler).warning('用户列表为空\n')
              return

          for followed_user in followed_users:
              profile_url = followed_user['out_user_url']
              internal_uid = followed_user['our_uid']
              external_uid = followed_user['out_uid']
              Common.logger(log_type, crawler).info(f'获取 {external_uid} 主页视频\n')
              cls.get_videos(
                  log_type=log_type,
                  crawler=crawler,
                  task=task,
                  our_uid=internal_uid,
                  oss_endpoint=oss_endpoint,
                  env=env,
                  out_uid=external_uid,
                  out_user_url=profile_url,
              )
              # Clear the continuation value before moving to the next user.
              cls.continuation = ''
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
  # Script entry point. Nothing runs by default; the commented lines below are
  # manual debugging calls kept for reference.
  if __name__ == "__main__":
      # print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
      # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'prod', 'prod'))
      # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
      # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
      # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')
      # print(Follow.filter_emoji(<sample title containing emoji>))
      # Follow.repeat_video('follow', 'youtube', 4, "dev", "local")
      # title = <sample Chinese title, or a mixed English/Chinese title>
      # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))
      pass