xigua_recommend.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/4/7
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. from datetime import date, timedelta
  13. from hashlib import md5
  14. import requests
  15. import urllib3
  16. from requests.adapters import HTTPAdapter
  17. from selenium import webdriver
  18. from selenium.webdriver import DesiredCapabilities
  19. from selenium.webdriver.chrome.service import Service
  20. sys.path.append(os.getcwd())
  21. from common.publish import Publish
  22. from common.common import Common
  23. from common.feishu import Feishu
  24. from common.scheduling_db import MysqlHelper
  25. class XiguaRecommend:
  26. platform = "西瓜视频"
  @classmethod
  def xigua_config(cls, log_type, crawler, text, env):
      """Return the title or filter word list configured for source "xigua".

      Every row returned by the crawler_config query carries a ``config``
      string that is evaluated into a dict. The comma-separated values found
      under its "title" and "filter" keys are accumulated across all rows.

      :param log_type: forwarded to MysqlHelper.get_values
      :param crawler: forwarded to MysqlHelper.get_values
      :param text: "title" or "filter"; selects which list is returned
      :param env: forwarded to MysqlHelper.get_values
      :return: the title list, the filter list, or None for any other ``text``
      """
      query = f"""select * from crawler_config where source="xigua" """
      rows = MysqlHelper.get_values(log_type, crawler, query, env, action='')
      titles = []
      filters = []
      for row in rows:
          # NOTE(review): eval() executes whatever is stored in the config
          # column; confirm that column is only writable by trusted parties,
          # or switch to a literal-only parser.
          settings = eval(row['config'])
          for key, value in settings.items():
              if key == "title":
                  titles.extend(value.split(","))
              if key == "filter":
                  filters.extend(value.split(","))
      if text == "title":
          return titles
      if text == "filter":
          return filters
      return None
  49. @classmethod
  50. def download_rule(cls, video_dict):
  51. publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
  52. publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
  53. if int(video_dict['play_cnt']) >= 10000:
  54. if 60*30 >= int(video_dict['duration']) >= 60:
  55. if int(video_dict['publish_time_stamp']) >= publish_time_stamp_rule:
  56. return True
  57. else:
  58. return False
  59. else:
  60. return False
  61. else:
  62. return False
  63. @classmethod
  64. def random_signature(cls):
  65. src_digits = string.digits # string_数字
  66. src_uppercase = string.ascii_uppercase # string_大写字母
  67. src_lowercase = string.ascii_lowercase # string_小写字母
  68. digits_num = random.randint(1, 6)
  69. uppercase_num = random.randint(1, 26 - digits_num - 1)
  70. lowercase_num = 26 - (digits_num + uppercase_num)
  71. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  72. src_lowercase, lowercase_num)
  73. random.shuffle(password)
  74. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  75. new_password_start = new_password[0:18]
  76. new_password_end = new_password[-7:]
  77. if new_password[18] == '8':
  78. new_password = new_password_start + 'w' + new_password_end
  79. elif new_password[18] == '9':
  80. new_password = new_password_start + 'x' + new_password_end
  81. elif new_password[18] == '-':
  82. new_password = new_password_start + 'y' + new_password_end
  83. elif new_password[18] == '.':
  84. new_password = new_password_start + 'z' + new_password_end
  85. else:
  86. new_password = new_password_start + 'y' + new_password_end
  87. return new_password
  @classmethod
  def get_signature(cls, log_type, crawler, env):
      """Capture a ``_signature`` query value from requests made by ixigua.com.

      Starts a headless Chrome with performance logging enabled, loads
      https://www.ixigua.com/, scrolls, refreshes, and then scans the
      performance log for the first request whose document URL contains
      'www.ixigua.com' and whose request URL carries a ``_signature``
      parameter.

      :param log_type: forwarded to Common.logger
      :param crawler: forwarded to Common.logger
      :param env: "dev" selects the hard-coded local chromedriver path;
          any other value uses the default driver lookup
      :return: the signature string, or None when none was found or an
          exception occurred (the exception is logged, not raised)
      """
      try:
          # Copy the capabilities so the shared class-level dict is not mutated.
          capabilities = DesiredCapabilities.CHROME.copy()
          # Enable the performance log so network requests can be inspected.
          capabilities["goog:loggingPrefs"] = {"performance": "ALL"}

          # Run without opening a browser window.
          chrome_options = webdriver.ChromeOptions()
          chrome_options.add_argument("headless")
          chrome_options.add_argument(
              'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
          chrome_options.add_argument("--no-sandbox")

          # Driver initialisation.
          if env == "dev":
              driver = webdriver.Chrome(
                  desired_capabilities=capabilities, options=chrome_options,
                  service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
          else:
              driver = webdriver.Chrome(desired_capabilities=capabilities, options=chrome_options)

          try:
              driver.implicitly_wait(10)
              driver.get('https://www.ixigua.com/')
              time.sleep(1)
              # Scroll the page down by 2000 pixels.
              driver.execute_script('window.scrollBy(0, 2000)')
              driver.refresh()
              logs = driver.get_log("performance")
          finally:
              # Always shut the browser down, even when a WebDriver call fails,
              # so no Chrome process is left behind.
              driver.quit()

          for line in logs:
              message = json.loads(line['message'])['message']
              if 'params' not in message:
                  continue
              params = message['params']
              if 'www.ixigua.com' not in params.get('documentURL', ''):
                  continue
              # Entries without a 'request' payload are skipped instead of
              # raising KeyError and aborting the whole scan.
              request_url = (params.get('request') or {}).get('url', '')
              if '_signature' not in request_url:
                  continue
              return request_url.split('_signature=')[-1].split('&')[0]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
  134. # 获取视频详情
  135. @classmethod
  136. def get_video_url(cls, log_type, crawler, gid):
  137. try:
  138. url = 'https://www.ixigua.com/api/mixVideo/information?'
  139. headers = {
  140. "accept-encoding": "gzip, deflate",
  141. "accept-language": "zh-CN,zh-Hans;q=0.9",
  142. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
  143. "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
  144. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  145. }
  146. params = {
  147. 'mixId': gid,
  148. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
  149. 'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  150. 'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
  151. '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
  152. 'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
  153. }
  154. cookies = {
  155. 'ixigua-a-s': '1',
  156. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
  157. 'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  158. 'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
  159. '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
  160. 'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
  161. 'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
  162. '__ac_nonce': '06304878000964fdad287',
  163. '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
  164. 'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
  165. 'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
  166. '_tea_utm_cache_1300': 'undefined',
  167. 'support_avif': 'false',
  168. 'support_webp': 'false',
  169. 'xiguavideopcwebid': '7134967546256016900',
  170. 'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
  171. }
  172. urllib3.disable_warnings()
  173. s = requests.session()
  174. # max_retries=3 重试3次
  175. s.mount('http://', HTTPAdapter(max_retries=3))
  176. s.mount('https://', HTTPAdapter(max_retries=3))
  177. response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
  178. proxies=Common.tunnel_proxies(), timeout=5)
  179. response.close()
  180. if 'data' not in response.json() or response.json()['data'] == '':
  181. Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
  182. else:
  183. video_info = response.json()['data']['gidInformation']['packerData']['video']
  184. video_url_dict = {}
  185. # video_url
  186. if 'videoResource' not in video_info:
  187. video_url_dict["video_url"] = ''
  188. video_url_dict["audio_url"] = ''
  189. video_url_dict["video_width"] = 0
  190. video_url_dict["video_height"] = 0
  191. elif 'dash_120fps' in video_info['videoResource']:
  192. if "video_list" in video_info['videoResource']['dash_120fps'] and 'video_4' in \
  193. video_info['videoResource']['dash_120fps']['video_list']:
  194. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_4'][
  195. 'backup_url_1']
  196. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_4'][
  197. 'backup_url_1']
  198. if len(video_url) % 3 == 1:
  199. video_url += '=='
  200. elif len(video_url) % 3 == 2:
  201. video_url += '='
  202. elif len(audio_url) % 3 == 1:
  203. audio_url += '=='
  204. elif len(audio_url) % 3 == 2:
  205. audio_url += '='
  206. video_url = base64.b64decode(video_url).decode('utf8')
  207. audio_url = base64.b64decode(audio_url).decode('utf8')
  208. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_4']['vwidth']
  209. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_4'][
  210. 'vheight']
  211. video_url_dict["video_url"] = video_url
  212. video_url_dict["audio_url"] = audio_url
  213. video_url_dict["video_width"] = video_width
  214. video_url_dict["video_height"] = video_height
  215. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_3' in \
  216. video_info['videoResource']['dash_120fps']['video_list']:
  217. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_3'][
  218. 'backup_url_1']
  219. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_3'][
  220. 'backup_url_1']
  221. if len(video_url) % 3 == 1:
  222. video_url += '=='
  223. elif len(video_url) % 3 == 2:
  224. video_url += '='
  225. elif len(audio_url) % 3 == 1:
  226. audio_url += '=='
  227. elif len(audio_url) % 3 == 2:
  228. audio_url += '='
  229. video_url = base64.b64decode(video_url).decode('utf8')
  230. audio_url = base64.b64decode(audio_url).decode('utf8')
  231. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_3']['vwidth']
  232. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_3'][
  233. 'vheight']
  234. video_url_dict["video_url"] = video_url
  235. video_url_dict["audio_url"] = audio_url
  236. video_url_dict["video_width"] = video_width
  237. video_url_dict["video_height"] = video_height
  238. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_2' in \
  239. video_info['videoResource']['dash_120fps']['video_list']:
  240. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_2'][
  241. 'backup_url_1']
  242. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_2'][
  243. 'backup_url_1']
  244. if len(video_url) % 3 == 1:
  245. video_url += '=='
  246. elif len(video_url) % 3 == 2:
  247. video_url += '='
  248. elif len(audio_url) % 3 == 1:
  249. audio_url += '=='
  250. elif len(audio_url) % 3 == 2:
  251. audio_url += '='
  252. video_url = base64.b64decode(video_url).decode('utf8')
  253. audio_url = base64.b64decode(audio_url).decode('utf8')
  254. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_2']['vwidth']
  255. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_2'][
  256. 'vheight']
  257. video_url_dict["video_url"] = video_url
  258. video_url_dict["audio_url"] = audio_url
  259. video_url_dict["video_width"] = video_width
  260. video_url_dict["video_height"] = video_height
  261. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_1' in \
  262. video_info['videoResource']['dash_120fps']['video_list']:
  263. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_1'][
  264. 'backup_url_1']
  265. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_1'][
  266. 'backup_url_1']
  267. if len(video_url) % 3 == 1:
  268. video_url += '=='
  269. elif len(video_url) % 3 == 2:
  270. video_url += '='
  271. elif len(audio_url) % 3 == 1:
  272. audio_url += '=='
  273. elif len(audio_url) % 3 == 2:
  274. audio_url += '='
  275. video_url = base64.b64decode(video_url).decode('utf8')
  276. audio_url = base64.b64decode(audio_url).decode('utf8')
  277. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_1']['vwidth']
  278. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_1'][
  279. 'vheight']
  280. video_url_dict["video_url"] = video_url
  281. video_url_dict["audio_url"] = audio_url
  282. video_url_dict["video_width"] = video_width
  283. video_url_dict["video_height"] = video_height
  284. elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
  285. and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  286. and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  287. and len(
  288. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
  289. and len(
  290. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
  291. video_url = \
  292. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
  293. 'backup_url_1']
  294. audio_url = \
  295. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1][
  296. 'backup_url_1']
  297. if len(video_url) % 3 == 1:
  298. video_url += '=='
  299. elif len(video_url) % 3 == 2:
  300. video_url += '='
  301. elif len(audio_url) % 3 == 1:
  302. audio_url += '=='
  303. elif len(audio_url) % 3 == 2:
  304. audio_url += '='
  305. video_url = base64.b64decode(video_url).decode('utf8')
  306. audio_url = base64.b64decode(audio_url).decode('utf8')
  307. video_width = \
  308. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
  309. 'vwidth']
  310. video_height = \
  311. video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
  312. 'vheight']
  313. video_url_dict["video_url"] = video_url
  314. video_url_dict["audio_url"] = audio_url
  315. video_url_dict["video_width"] = video_width
  316. video_url_dict["video_height"] = video_height
  317. else:
  318. video_url_dict["video_url"] = ''
  319. video_url_dict["audio_url"] = ''
  320. video_url_dict["video_width"] = 0
  321. video_url_dict["video_height"] = 0
  322. elif 'dash' in video_info['videoResource']:
  323. if "video_list" in video_info['videoResource']['dash'] and 'video_4' in \
  324. video_info['videoResource']['dash']['video_list']:
  325. video_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  326. audio_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  327. if len(video_url) % 3 == 1:
  328. video_url += '=='
  329. elif len(video_url) % 3 == 2:
  330. video_url += '='
  331. elif len(audio_url) % 3 == 1:
  332. audio_url += '=='
  333. elif len(audio_url) % 3 == 2:
  334. audio_url += '='
  335. video_url = base64.b64decode(video_url).decode('utf8')
  336. audio_url = base64.b64decode(audio_url).decode('utf8')
  337. video_width = video_info['videoResource']['dash']['video_list']['video_4']['vwidth']
  338. video_height = video_info['videoResource']['dash']['video_list']['video_4']['vheight']
  339. video_url_dict["video_url"] = video_url
  340. video_url_dict["audio_url"] = audio_url
  341. video_url_dict["video_width"] = video_width
  342. video_url_dict["video_height"] = video_height
  343. elif "video_list" in video_info['videoResource']['dash'] and 'video_3' in \
  344. video_info['videoResource']['dash']['video_list']:
  345. video_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  346. audio_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  347. if len(video_url) % 3 == 1:
  348. video_url += '=='
  349. elif len(video_url) % 3 == 2:
  350. video_url += '='
  351. elif len(audio_url) % 3 == 1:
  352. audio_url += '=='
  353. elif len(audio_url) % 3 == 2:
  354. audio_url += '='
  355. video_url = base64.b64decode(video_url).decode('utf8')
  356. audio_url = base64.b64decode(audio_url).decode('utf8')
  357. video_width = video_info['videoResource']['dash']['video_list']['video_3']['vwidth']
  358. video_height = video_info['videoResource']['dash']['video_list']['video_3']['vheight']
  359. video_url_dict["video_url"] = video_url
  360. video_url_dict["audio_url"] = audio_url
  361. video_url_dict["video_width"] = video_width
  362. video_url_dict["video_height"] = video_height
  363. elif "video_list" in video_info['videoResource']['dash'] and 'video_2' in \
  364. video_info['videoResource']['dash']['video_list']:
  365. video_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  366. audio_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  367. if len(video_url) % 3 == 1:
  368. video_url += '=='
  369. elif len(video_url) % 3 == 2:
  370. video_url += '='
  371. elif len(audio_url) % 3 == 1:
  372. audio_url += '=='
  373. elif len(audio_url) % 3 == 2:
  374. audio_url += '='
  375. video_url = base64.b64decode(video_url).decode('utf8')
  376. audio_url = base64.b64decode(audio_url).decode('utf8')
  377. video_width = video_info['videoResource']['dash']['video_list']['video_2']['vwidth']
  378. video_height = video_info['videoResource']['dash']['video_list']['video_2']['vheight']
  379. video_url_dict["video_url"] = video_url
  380. video_url_dict["audio_url"] = audio_url
  381. video_url_dict["video_width"] = video_width
  382. video_url_dict["video_height"] = video_height
  383. elif "video_list" in video_info['videoResource']['dash'] and 'video_1' in \
  384. video_info['videoResource']['dash']['video_list']:
  385. video_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  386. audio_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  387. if len(video_url) % 3 == 1:
  388. video_url += '=='
  389. elif len(video_url) % 3 == 2:
  390. video_url += '='
  391. elif len(audio_url) % 3 == 1:
  392. audio_url += '=='
  393. elif len(audio_url) % 3 == 2:
  394. audio_url += '='
  395. video_url = base64.b64decode(video_url).decode('utf8')
  396. audio_url = base64.b64decode(audio_url).decode('utf8')
  397. video_width = video_info['videoResource']['dash']['video_list']['video_1']['vwidth']
  398. video_height = video_info['videoResource']['dash']['video_list']['video_1']['vheight']
  399. video_url_dict["video_url"] = video_url
  400. video_url_dict["audio_url"] = audio_url
  401. video_url_dict["video_width"] = video_width
  402. video_url_dict["video_height"] = video_height
  403. elif 'dynamic_video' in video_info['videoResource']['dash'] \
  404. and 'dynamic_video_list' in video_info['videoResource']['dash']['dynamic_video'] \
  405. and 'dynamic_audio_list' in video_info['videoResource']['dash']['dynamic_video'] \
  406. and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list']) != 0 \
  407. and len(
  408. video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list']) != 0:
  409. video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1][
  410. 'backup_url_1']
  411. audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1][
  412. 'backup_url_1']
  413. if len(video_url) % 3 == 1:
  414. video_url += '=='
  415. elif len(video_url) % 3 == 2:
  416. video_url += '='
  417. elif len(audio_url) % 3 == 1:
  418. audio_url += '=='
  419. elif len(audio_url) % 3 == 2:
  420. audio_url += '='
  421. video_url = base64.b64decode(video_url).decode('utf8')
  422. audio_url = base64.b64decode(audio_url).decode('utf8')
  423. video_width = \
  424. video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  425. video_height = \
  426. video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  427. video_url_dict["video_url"] = video_url
  428. video_url_dict["audio_url"] = audio_url
  429. video_url_dict["video_width"] = video_width
  430. video_url_dict["video_height"] = video_height
  431. else:
  432. video_url_dict["video_url"] = ''
  433. video_url_dict["audio_url"] = ''
  434. video_url_dict["video_width"] = 0
  435. video_url_dict["video_height"] = 0
  436. elif 'normal' in video_info['videoResource']:
  437. if "video_list" in video_info['videoResource']['normal'] and 'video_4' in \
  438. video_info['videoResource']['normal']['video_list']:
  439. video_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  440. audio_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  441. if len(video_url) % 3 == 1:
  442. video_url += '=='
  443. elif len(video_url) % 3 == 2:
  444. video_url += '='
  445. elif len(audio_url) % 3 == 1:
  446. audio_url += '=='
  447. elif len(audio_url) % 3 == 2:
  448. audio_url += '='
  449. video_url = base64.b64decode(video_url).decode('utf8')
  450. audio_url = base64.b64decode(audio_url).decode('utf8')
  451. video_width = video_info['videoResource']['normal']['video_list']['video_4']['vwidth']
  452. video_height = video_info['videoResource']['normal']['video_list']['video_4']['vheight']
  453. video_url_dict["video_url"] = video_url
  454. video_url_dict["audio_url"] = audio_url
  455. video_url_dict["video_width"] = video_width
  456. video_url_dict["video_height"] = video_height
  457. elif "video_list" in video_info['videoResource']['normal'] and 'video_3' in \
  458. video_info['videoResource']['normal']['video_list']:
  459. video_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  460. audio_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  461. if len(video_url) % 3 == 1:
  462. video_url += '=='
  463. elif len(video_url) % 3 == 2:
  464. video_url += '='
  465. elif len(audio_url) % 3 == 1:
  466. audio_url += '=='
  467. elif len(audio_url) % 3 == 2:
  468. audio_url += '='
  469. video_url = base64.b64decode(video_url).decode('utf8')
  470. audio_url = base64.b64decode(audio_url).decode('utf8')
  471. video_width = video_info['videoResource']['normal']['video_list']['video_3']['vwidth']
  472. video_height = video_info['videoResource']['normal']['video_list']['video_3']['vheight']
  473. video_url_dict["video_url"] = video_url
  474. video_url_dict["audio_url"] = audio_url
  475. video_url_dict["video_width"] = video_width
  476. video_url_dict["video_height"] = video_height
  477. elif "video_list" in video_info['videoResource']['normal'] and 'video_2' in \
  478. video_info['videoResource']['normal']['video_list']:
  479. video_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  480. audio_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  481. if len(video_url) % 3 == 1:
  482. video_url += '=='
  483. elif len(video_url) % 3 == 2:
  484. video_url += '='
  485. elif len(audio_url) % 3 == 1:
  486. audio_url += '=='
  487. elif len(audio_url) % 3 == 2:
  488. audio_url += '='
  489. video_url = base64.b64decode(video_url).decode('utf8')
  490. audio_url = base64.b64decode(audio_url).decode('utf8')
  491. video_width = video_info['videoResource']['normal']['video_list']['video_2']['vwidth']
  492. video_height = video_info['videoResource']['normal']['video_list']['video_2']['vheight']
  493. video_url_dict["video_url"] = video_url
  494. video_url_dict["audio_url"] = audio_url
  495. video_url_dict["video_width"] = video_width
  496. video_url_dict["video_height"] = video_height
  497. elif "video_list" in video_info['videoResource']['normal'] and 'video_1' in \
  498. video_info['videoResource']['normal']['video_list']:
  499. video_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  500. audio_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  501. if len(video_url) % 3 == 1:
  502. video_url += '=='
  503. elif len(video_url) % 3 == 2:
  504. video_url += '='
  505. elif len(audio_url) % 3 == 1:
  506. audio_url += '=='
  507. elif len(audio_url) % 3 == 2:
  508. audio_url += '='
  509. video_url = base64.b64decode(video_url).decode('utf8')
  510. audio_url = base64.b64decode(audio_url).decode('utf8')
  511. video_width = video_info['videoResource']['normal']['video_list']['video_1']['vwidth']
  512. video_height = video_info['videoResource']['normal']['video_list']['video_1']['vheight']
  513. video_url_dict["video_url"] = video_url
  514. video_url_dict["audio_url"] = audio_url
  515. video_url_dict["video_width"] = video_width
  516. video_url_dict["video_height"] = video_height
  517. elif 'dynamic_video' in video_info['videoResource']['normal'] \
  518. and 'dynamic_video_list' in video_info['videoResource']['normal']['dynamic_video'] \
  519. and 'dynamic_audio_list' in video_info['videoResource']['normal']['dynamic_video'] \
  520. and len(
  521. video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list']) != 0 \
  522. and len(
  523. video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list']) != 0:
  524. video_url = \
  525. video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  526. 'backup_url_1']
  527. audio_url = \
  528. video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list'][-1][
  529. 'backup_url_1']
  530. if len(video_url) % 3 == 1:
  531. video_url += '=='
  532. elif len(video_url) % 3 == 2:
  533. video_url += '='
  534. elif len(audio_url) % 3 == 1:
  535. audio_url += '=='
  536. elif len(audio_url) % 3 == 2:
  537. audio_url += '='
  538. video_url = base64.b64decode(video_url).decode('utf8')
  539. audio_url = base64.b64decode(audio_url).decode('utf8')
  540. video_width = \
  541. video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  542. 'vwidth']
  543. video_height = \
  544. video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  545. 'vheight']
  546. video_url_dict["video_url"] = video_url
  547. video_url_dict["audio_url"] = audio_url
  548. video_url_dict["video_width"] = video_width
  549. video_url_dict["video_height"] = video_height
  550. else:
  551. video_url_dict["video_url"] = ''
  552. video_url_dict["audio_url"] = ''
  553. video_url_dict["video_width"] = 0
  554. video_url_dict["video_height"] = 0
  555. else:
  556. video_url_dict["video_url"] = ''
  557. video_url_dict["audio_url"] = ''
  558. video_url_dict["video_width"] = 0
  559. video_url_dict["video_height"] = 0
  560. return video_url_dict
  561. except Exception as e:
  562. Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  # Filter-word lexicon
  @classmethod
  def filter_words(cls, log_type, crawler):
      """Read the filter words from Feishu sheet 'KGB4Hc'.

      Polls Feishu.get_values_batch until it returns a sheet, waiting 10
      seconds between attempts, then flattens the rows and drops None cells.

      :param log_type: forwarded to Feishu.get_values_batch and Common.logger
      :param crawler: forwarded to Feishu.get_values_batch and Common.logger
      :return: flat list of non-None cell values, or None when an exception
          occurred (the exception is logged, not raised)
      """
      try:
          while True:
              filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if filter_words_sheet is None:
                  Common.logger(log_type, crawler).warning(
                      f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                  # Actually wait the announced 10 seconds; without this the
                  # loop retries immediately and hammers the Feishu API.
                  time.sleep(10)
                  continue
              return [cell for row in filter_words_sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Count crawler_video rows already stored for this video id.

      :param log_type: forwarded to MysqlHelper.get_values
      :param crawler: forwarded to MysqlHelper.get_values
      :param video_id: value matched against the out_video_id column
      :param env: forwarded to MysqlHelper.get_values
      :return: number of rows returned by the query
      """
      # NOTE(review): video_id is interpolated straight into the SQL text;
      # confirm it can never contain quotes, or use a parameterised query if
      # MysqlHelper offers one.
      query = f""" select * from crawler_video where platform="西瓜视频" and out_video_id="{video_id}"; """
      matching_rows = MysqlHelper.get_values(log_type, crawler, query, env)
      return len(matching_rows)
  588. @classmethod
  589. def get_videoList(cls, log_type, crawler, oss_endpoint, env):
  590. queryCount = 1
  591. while True:
  592. signature = cls.get_signature(log_type, crawler, env)
  593. if signature is None:
  594. Common.logger(log_type, crawler).warning(f"signature:{signature}")
  595. continue
  596. url = "https://www.ixigua.com/api/feedv2/feedById?"
  597. params = {
  598. "channelId": "94349543909",
  599. "count": "9",
  600. "maxTime": str(int(time.time())),
  601. # "maxTime": "1683190690",
  602. "queryCount": str(queryCount),
  603. "_signature": signature,
  604. "request_from": "701",
  605. "offset": "0",
  606. "referrer:": "https://open.weixin.qq.com/",
  607. "aid": "1768",
  608. "msToken": "ehRUgm8-TX8-TCaeemY6U1BsvFmUmHG1v3EofQclihHwzd9VWxs2nW8-jj0ATkJK6fkZ_u25TEdE14n5Xb-4tF17MTRlMrhWVhe-PXxEYDaoApCytTB5P83OD4EbwQ==",
  609. "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
  610. }
  611. headers = {
  612. 'referer': 'https://www.ixigua.com/',
  613. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
  614. 'authority': 'www.ixigua.com',
  615. 'accept': 'application/json, text/plain, */*',
  616. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  617. 'cache-control': 'no-cache',
  618. 'cookie': 'csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; support_webp=true; support_avif=true; ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; tt_scid=-DgOlrfJzUDs5Q5JmFKRg78l2WnsrbSnyFnrPl4AzxWShhd2zxBt269lRjlZVlnCb0d8; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683189687%7C82737ee4f85defde37e91ed3e387476e1d6bc4b45eb5488ad178fe28ff2a2fd6; msToken=ehRUgm8-TX8-TCaeemY6U1BsvFmUmHG1v3EofQclihHwzd9VWxs2nW8-jj0ATkJK6fkZ_u25TEdE14n5Xb-4tF17MTRlMrhWVhe-PXxEYDaoApCytTB5P83OD4EbwQ==; ixigua-a-s=3',
  619. 'pragma': 'no-cache',
  620. 'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
  621. 'sec-ch-ua-mobile': '?0',
  622. 'sec-ch-ua-platform': '"macOS"',
  623. 'sec-fetch-dest': 'empty',
  624. 'sec-fetch-mode': 'cors',
  625. 'sec-fetch-site': 'same-origin',
  626. 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
  627. 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
  628. }
  629. urllib3.disable_warnings()
  630. s = requests.session()
  631. # max_retries=3 重试3次
  632. s.mount('http://', HTTPAdapter(max_retries=3))
  633. s.mount('https://', HTTPAdapter(max_retries=3))
  634. response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
  635. response.close()
  636. queryCount += 1
  637. Common.logger(log_type, crawler).info(f"queryCount:{queryCount}")
  638. if response.status_code != 200:
  639. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  640. return
  641. elif 'data' not in response.text:
  642. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  643. return
  644. elif 'channelFeed' not in response.json()['data']:
  645. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  646. return
  647. elif 'Data' not in response.json()['data']['channelFeed']:
  648. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  649. return
  650. elif len(response.json()['data']['channelFeed']['Data']) == 0:
  651. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  652. return
  653. else:
  654. videoList = response.json()['data']['channelFeed']['Data']
  655. for i in range(len(videoList)):
  656. if 'data' not in videoList[i]:
  657. continue
  658. # video_title
  659. video_title = videoList[i]['data'].get('title', '').replace('"' ,'').replace("'", '')
  660. if video_title == '':
  661. video_title = random.choice(cls.xigua_config(log_type, crawler, "title", env))
  662. # video_id
  663. video_id = videoList[i]['data'].get('vid', '')
  664. # play_cnt
  665. play_cnt = int(videoList[i]['data'].get('playNum', 0))
  666. # comment_cnt
  667. comment_cnt = int(videoList[i]['data'].get('commentNum', 0))
  668. # gid
  669. gid = videoList[i]['data'].get('item_id', 0)
  670. # share_cnt / like_cnt
  671. share_cnt = 0
  672. like_cnt = 0
  673. # duration
  674. duration = int(videoList[i]['data'].get('duration', 0))
  675. # publish_time_stamp
  676. publish_time_stamp = int(videoList[i]['data'].get('publish_time', 0))
  677. # publish_time_str
  678. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  679. # cover_url
  680. cover_url = videoList[i]['data'].get('image_url', '')
  681. # user_name
  682. user_name = videoList[i]['data']['user_info'].get('name', '')
  683. # user_id
  684. user_id = videoList[i]['data']['user_info'].get('user_id', '')
  685. # avatar_url
  686. avatar_url = videoList[i]['data']['user_info'].get('avatar_url', '')
  687. video_dict = {
  688. 'video_title': video_title,
  689. 'video_id': video_id,
  690. 'gid': gid,
  691. 'play_cnt': play_cnt,
  692. 'comment_cnt': comment_cnt,
  693. 'like_cnt': like_cnt,
  694. 'share_cnt': share_cnt,
  695. 'duration': duration,
  696. 'publish_time_stamp': publish_time_stamp,
  697. 'publish_time_str': publish_time_str,
  698. 'user_name': user_name,
  699. 'user_id': user_id,
  700. 'avatar_url': avatar_url,
  701. 'cover_url': cover_url,
  702. 'session': signature
  703. }
  704. for k, v in video_dict.items():
  705. Common.logger(log_type, crawler).info(f"{k}:{v}")
  706. if gid == 0 or video_id == '' or cover_url == '':
  707. Common.logger(log_type, crawler).info('无效视频\n')
  708. elif cls.download_rule(video_dict) is False:
  709. Common.logger(log_type, crawler).info('不满足抓取规则\n')
  710. elif any(str(word) if str(word) in video_title else False for word in cls.xigua_config(log_type, crawler, "filter", env)) is True:
  711. Common.logger(log_type, crawler).info('已中过滤词\n')
  712. elif cls.repeat_video(log_type, crawler, video_id, env) != 0:
  713. Common.logger(log_type, crawler).info('视频已下载\n')
  714. else:
  715. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  716. video_dict['video_url'] = video_url_dict["video_url"]
  717. video_dict["audio_url"] = video_url_dict["audio_url"]
  718. video_dict["video_width"] = video_url_dict["video_width"]
  719. video_dict["video_height"] = video_url_dict["video_height"]
  720. cls.download_publish(log_type, crawler, video_dict, oss_endpoint, env)
  721. @classmethod
  722. def download_publish(cls, log_type, crawler, video_dict, oss_endpoint, env):
  723. # 下载视频
  724. Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=video_dict['video_title'],
  725. url=video_dict['video_url'])
  726. # 下载音频
  727. Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=video_dict['video_title'],
  728. url=video_dict['audio_url'])
  729. # 合成音视频
  730. Common.video_compose(log_type=log_type, crawler=crawler,
  731. video_dir=f"./{crawler}/videos/{video_dict['video_title']}")
  732. md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
  733. if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
  734. # 删除视频文件夹
  735. shutil.rmtree(f"./{crawler}/videos/{md_title}")
  736. Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
  737. return
  738. # 下载封面
  739. Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'],
  740. url=video_dict['cover_url'])
  741. # 保存视频信息至txt
  742. Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
  743. # 上传视频
  744. Common.logger(log_type, crawler).info("开始上传视频...")
  745. our_video_id = Publish.upload_and_publish(log_type=log_type,
  746. crawler=crawler,
  747. strategy="推荐榜爬虫策略",
  748. our_uid="recommend",
  749. env=env,
  750. oss_endpoint=oss_endpoint)
  751. if env == 'dev':
  752. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  753. else:
  754. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  755. Common.logger(log_type, crawler).info("视频上传完成")
  756. if our_video_id is None:
  757. # 删除视频文件夹
  758. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
  759. return
  760. # 视频写入飞书
  761. Feishu.insert_columns(log_type, 'xigua', "1iKGF1", "ROWS", 1, 2)
  762. upload_time = int(time.time())
  763. values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
  764. "推荐榜爬虫策略",
  765. video_dict['video_title'],
  766. str(video_dict['video_id']),
  767. our_video_link,
  768. video_dict['gid'],
  769. video_dict['play_cnt'],
  770. video_dict['comment_cnt'],
  771. video_dict['like_cnt'],
  772. video_dict['share_cnt'],
  773. video_dict['duration'],
  774. str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
  775. video_dict['publish_time_str'],
  776. video_dict['user_name'],
  777. video_dict['user_id'],
  778. video_dict['avatar_url'],
  779. video_dict['cover_url'],
  780. video_dict['audio_url'],
  781. video_dict['video_url']]]
  782. time.sleep(1)
  783. Feishu.update_values(log_type, 'xigua', "1iKGF1", "F2:Z2", values)
  784. Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
  785. rule_dict = {
  786. "play_cnt": {"min": 10000},
  787. "duration": {"min": 60, "max": 60*30},
  788. "publish_day": {"min": 30}
  789. }
  790. # 视频信息保存数据库
  791. insert_sql = f""" insert into crawler_video(video_id,
  792. user_id,
  793. out_user_id,
  794. platform,
  795. strategy,
  796. out_video_id,
  797. video_title,
  798. cover_url,
  799. video_url,
  800. duration,
  801. publish_time,
  802. play_cnt,
  803. crawler_rule,
  804. width,
  805. height)
  806. values({our_video_id},
  807. {int(50322238)},
  808. "{video_dict['user_id']}",
  809. "{cls.platform}",
  810. "推荐榜爬虫策略",
  811. "{video_dict['video_id']}",
  812. "{video_dict['video_title']}",
  813. "{video_dict['cover_url']}",
  814. "{video_dict['video_url']}",
  815. {int(video_dict['duration'])},
  816. "{video_dict['publish_time_str']}",
  817. {int(video_dict['play_cnt'])},
  818. '{json.dumps(rule_dict)}',
  819. {int(video_dict['video_width'])},
  820. {int(video_dict['video_height'])}) """
  821. Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
  822. MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
  823. Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  824. if __name__ == "__main__":
  825. # XiguaRecommend.get_signature("recommend", "xigua", "dev")
  826. # XiguaRecommend.get_videolist("recommend", "xigua", "dev")
  827. # print(XiguaRecommend.get_video_url("recommend", "xigua", "7218171653242094139"))
  828. # print(XiguaRecommend.filter_words("recommend", "xigua"))
  829. print(XiguaRecommend.xigua_config("recommend", "xigua", "title", "dev"))
  830. pass