xigua_recommend.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/4/7
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. from datetime import date, timedelta
  13. from hashlib import md5
  14. import requests
  15. import urllib3
  16. from requests.adapters import HTTPAdapter
  17. from selenium import webdriver
  18. from selenium.webdriver import DesiredCapabilities
  19. from selenium.webdriver.chrome.service import Service
  20. sys.path.append(os.getcwd())
  21. from common.publish import Publish
  22. from common.common import Common
  23. from common.feishu import Feishu
  24. from common.scheduling_db import MysqlHelper
  25. class XiguaRecommend:
  26. platform = "西瓜视频"
  @classmethod
  def xigua_config(cls, log_type, crawler, text, env):
      """Collect the comma-separated "title" or "filter" words configured for xigua.

      Every row returned for ``source="xigua"`` in ``crawler_config`` has its
      ``config`` column evaluated into a dict; the values stored under the
      keys ``"title"`` and ``"filter"`` are split on commas and accumulated
      across all rows.

      :param log_type: passed through to ``MysqlHelper.get_values``
      :param crawler: passed through to ``MysqlHelper.get_values``
      :param text: ``"title"`` or ``"filter"`` - selects which list is returned
      :param env: passed through to ``MysqlHelper.get_values``
      :return: list of words for the requested kind; ``None`` for any other ``text``
      """
      query = f"""select * from crawler_config where source="xigua" """
      rows = MysqlHelper.get_values(log_type, crawler, query, env, action='')
      words_by_kind = {"title": [], "filter": []}
      for row in rows:
          # NOTE(review): eval() executes whatever text is stored in the config
          # column; if that column can ever hold untrusted data, switch to a
          # safe literal parser.
          parsed = eval(row['config'])
          for key, value in parsed.items():
              if key in words_by_kind:
                  words_by_kind[key].extend(value.split(","))
      # Unknown selectors fall through to None, as there is no default list.
      return words_by_kind.get(text)
  49. @classmethod
  50. def download_rule(cls, video_dict):
  51. publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
  52. publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
  53. if int(video_dict['play_cnt']) >= 10000:
  54. if 60*30 >= int(video_dict['duration']) >= 60:
  55. if int(video_dict['publish_time_stamp']) >= publish_time_stamp_rule:
  56. return True
  57. else:
  58. return False
  59. else:
  60. return False
  61. else:
  62. return False
  63. @classmethod
  64. def random_signature(cls):
  65. src_digits = string.digits # string_数字
  66. src_uppercase = string.ascii_uppercase # string_大写字母
  67. src_lowercase = string.ascii_lowercase # string_小写字母
  68. digits_num = random.randint(1, 6)
  69. uppercase_num = random.randint(1, 26 - digits_num - 1)
  70. lowercase_num = 26 - (digits_num + uppercase_num)
  71. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  72. src_lowercase, lowercase_num)
  73. random.shuffle(password)
  74. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  75. new_password_start = new_password[0:18]
  76. new_password_end = new_password[-7:]
  77. if new_password[18] == '8':
  78. new_password = new_password_start + 'w' + new_password_end
  79. elif new_password[18] == '9':
  80. new_password = new_password_start + 'x' + new_password_end
  81. elif new_password[18] == '-':
  82. new_password = new_password_start + 'y' + new_password_end
  83. elif new_password[18] == '.':
  84. new_password = new_password_start + 'z' + new_password_end
  85. else:
  86. new_password = new_password_start + 'y' + new_password_end
  87. return new_password
  @classmethod
  def get_signature(cls, log_type, crawler, env):
      """Capture a ``_signature`` query value from requests made by ixigua.com.

      A headless Chrome session with performance logging enabled loads
      https://www.ixigua.com/, scrolls, refreshes, and the performance log is
      scanned for the first request URL containing ``_signature``.

      :param log_type: passed to ``Common.logger``
      :param crawler: passed to ``Common.logger``
      :param env: ``"dev"`` uses the hard-coded local chromedriver path;
          any other value lets Selenium locate the driver itself
      :return: the signature string, or ``None`` if none was found or an
          error occurred (errors are logged, not raised)
      """
      try:
          # Copy the capabilities so the shared class-level dict is not mutated.
          caps = DesiredCapabilities.CHROME.copy()
          # Ask Chrome to record network events in the performance log.
          caps["goog:loggingPrefs"] = {"performance": "ALL"}

          # Run without opening a browser window.
          chrome_options = webdriver.ChromeOptions()
          chrome_options.add_argument("headless")
          chrome_options.add_argument(
              'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
          chrome_options.add_argument("--no-sandbox")

          # Driver initialisation.
          if env == "dev":
              driver = webdriver.Chrome(
                  desired_capabilities=caps, options=chrome_options,
                  service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
          else:
              driver = webdriver.Chrome(desired_capabilities=caps, options=chrome_options)

          try:
              driver.implicitly_wait(10)
              driver.get('https://www.ixigua.com/')
              time.sleep(1)
              # Scroll the page down by 2000 pixels.
              driver.execute_script('window.scrollBy(0, 2000)')
              driver.refresh()
              logs = driver.get_log("performance")
          finally:
              # Always shut the browser down, even when navigation fails,
              # so no headless Chrome process is left behind.
              driver.quit()

          for line in logs:
              event = json.loads(line['message'])['message']
              params = event.get('params', {})
              if 'www.ixigua.com' not in params.get('documentURL', ''):
                  continue
              # Events without a request/url are skipped instead of aborting
              # the whole scan with a KeyError.
              url = params.get('request', {}).get('url', '')
              if '_signature' not in url:
                  continue
              return url.split('_signature=')[-1].split('&')[0]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
      return None
  # Fetch video details
  @staticmethod
  def _decode_b64_url(encoded):
      """Decode a base64-encoded URL, restoring any missing ``=`` padding."""
      # Base64 text must be a multiple of 4 characters long.
      padded = encoded + '=' * (-len(encoded) % 4)
      return base64.b64decode(padded).decode('utf8')

  @staticmethod
  def _pick_stream(resource):
      """Return ``(video_entry, audio_entry)`` for the preferred rendition, or ``None``.

      ``video_list`` entries are tried from ``video_4`` down to ``video_1``
      (the same entry supplies both URLs); otherwise the last items of the
      non-empty ``dynamic_video`` lists are used.
      """
      video_list = resource['video_list'] if 'video_list' in resource else {}
      for name in ('video_4', 'video_3', 'video_2', 'video_1'):
          if name in video_list:
              return video_list[name], video_list[name]
      if 'dynamic_video' in resource:
          dynamic = resource['dynamic_video']
          if 'dynamic_video_list' in dynamic and 'dynamic_audio_list' in dynamic \
                  and len(dynamic['dynamic_video_list']) != 0 \
                  and len(dynamic['dynamic_audio_list']) != 0:
              return dynamic['dynamic_video_list'][-1], dynamic['dynamic_audio_list'][-1]
      return None

  @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Look up the playable video/audio URLs and dimensions for one video.

      Calls the ``mixVideo/information`` endpoint for ``gid`` and reads the
      first of ``dash_120fps``, ``dash`` or ``normal`` present under
      ``videoResource``. The base64-encoded ``backup_url_1`` values of the
      chosen rendition are decoded.

      :param log_type: passed to ``Common.logger``
      :param crawler: passed to ``Common.logger``
      :param gid: video id sent as the ``mixId`` query parameter
      :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
          ``video_height`` (empty strings / zeros when no rendition is
          available); ``None`` when the response carries no data or an
          error occurred (errors are logged, not raised)
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          urllib3.disable_warnings()
          # The session is used as a context manager so its connections are
          # released once the request is done.
          with requests.session() as s:
              # max_retries=3: retry up to 3 times
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
              response.close()

          # Parse the body once instead of on every access.
          payload = response.json()
          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
              return None

          video_info = payload['data']['gidInformation']['packerData']['video']
          # Defaults used whenever no usable rendition is found.
          video_url_dict = {
              "video_url": '',
              "audio_url": '',
              "video_width": 0,
              "video_height": 0,
          }
          resources = video_info['videoResource'] if 'videoResource' in video_info else {}
          for kind in ('dash_120fps', 'dash', 'normal'):
              if kind not in resources:
                  continue
              streams = cls._pick_stream(resources[kind])
              if streams is not None:
                  video_entry, audio_entry = streams
                  # Each URL is padded and decoded independently.
                  video_url_dict["video_url"] = cls._decode_b64_url(video_entry['backup_url_1'])
                  video_url_dict["audio_url"] = cls._decode_b64_url(audio_entry['backup_url_1'])
                  video_url_dict["video_width"] = video_entry['vwidth']
                  video_url_dict["video_height"] = video_entry['vheight']
              # Only the first resource kind present is consulted; there is
              # no fall-through to lower-priority kinds.
              break
          return video_url_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Count the ``crawler_video`` rows already stored for this video id.

      :param log_type: passed through to ``MysqlHelper.get_values``
      :param crawler: passed through to ``MysqlHelper.get_values``
      :param video_id: value matched against the ``out_video_id`` column
      :param env: passed through to ``MysqlHelper.get_values``
      :return: number of matching rows (0 means the video is new)
      """
      # NOTE(review): video_id is interpolated straight into the SQL text;
      # consider a parameterised query if MysqlHelper supports one.
      query = f""" select * from crawler_video where platform="西瓜视频" and out_video_id="{video_id}"; """
      matches = MysqlHelper.get_values(log_type, crawler, query, env)
      return len(matches)
  568. @classmethod
  569. def get_videoList(cls, log_type, crawler, oss_endpoint, env):
  570. queryCount = 1
  571. while True:
  572. signature = cls.get_signature(log_type, crawler, env)
  573. if signature is None:
  574. Common.logger(log_type, crawler).warning(f"signature:{signature}")
  575. continue
  576. url = "https://www.ixigua.com/api/feedv2/feedById?"
  577. params = {
  578. "channelId": "94349543909",
  579. "count": "9",
  580. "maxTime": str(int(time.time())),
  581. # "maxTime": "1683190690",
  582. "queryCount": str(queryCount),
  583. "_signature": signature,
  584. "request_from": "701",
  585. "offset": "0",
  586. "referrer:": "https://open.weixin.qq.com/",
  587. "aid": "1768",
  588. "msToken": "XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua",
  589. # "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
  590. }
  591. headers = {
  592. 'referer': 'https://www.ixigua.com/',
  593. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
  594. 'authority': 'www.ixigua.com',
  595. 'accept': 'application/json, text/plain, */*',
  596. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  597. 'cache-control': 'no-cache',
  598. # 'cookie': 'ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; support_webp=true; support_avif=true; csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; msToken=XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua; ixigua-a-s=1; tt_scid=UTduWO4ij7cX6YKx23sDuV4zjvFkGFtFk5ZBhEnd1lJ1EZBykStzU7tbWQOSzGdE0fc6; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683198318%7Cffc2eef612caab19a0db93b4cec27e21a6230f9b82ab4bf5b1c6193d082baab1',
  599. 'pragma': 'no-cache',
  600. 'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
  601. 'sec-ch-ua-mobile': '?0',
  602. 'sec-ch-ua-platform': '"macOS"',
  603. 'sec-fetch-dest': 'empty',
  604. 'sec-fetch-mode': 'cors',
  605. 'sec-fetch-site': 'same-origin',
  606. # 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
  607. # 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
  608. }
  609. urllib3.disable_warnings()
  610. s = requests.session()
  611. # max_retries=3 重试3次
  612. s.mount('http://', HTTPAdapter(max_retries=3))
  613. s.mount('https://', HTTPAdapter(max_retries=3))
  614. response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
  615. response.close()
  616. queryCount += 1
  617. Common.logger(log_type, crawler).info(f"queryCount:{queryCount}")
  618. if response.status_code != 200:
  619. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  620. return
  621. elif 'data' not in response.text:
  622. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  623. return
  624. elif 'channelFeed' not in response.json()['data']:
  625. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  626. return
  627. elif 'Data' not in response.json()['data']['channelFeed']:
  628. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  629. return
  630. elif len(response.json()['data']['channelFeed']['Data']) == 0:
  631. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  632. return
  633. else:
  634. videoList = response.json()['data']['channelFeed']['Data']
  635. for i in range(len(videoList)):
  636. if 'data' not in videoList[i]:
  637. continue
  638. # video_title
  639. video_title = videoList[i]['data'].get('title', '').replace('"' ,'').replace("'", '')
  640. if video_title == '':
  641. video_title = random.choice(cls.xigua_config(log_type, crawler, "title", env))
  642. # video_id
  643. video_id = videoList[i]['data'].get('vid', '')
  644. # play_cnt
  645. play_cnt = int(videoList[i]['data'].get('playNum', 0))
  646. # comment_cnt
  647. comment_cnt = int(videoList[i]['data'].get('commentNum', 0))
  648. # gid
  649. gid = videoList[i]['data'].get('item_id', 0)
  650. # share_cnt / like_cnt
  651. share_cnt = 0
  652. like_cnt = 0
  653. # duration
  654. duration = int(videoList[i]['data'].get('duration', 0))
  655. # publish_time_stamp
  656. publish_time_stamp = int(videoList[i]['data'].get('publish_time', 0))
  657. # publish_time_str
  658. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  659. # cover_url
  660. cover_url = videoList[i]['data'].get('image_url', '')
  661. # user_name
  662. user_name = videoList[i]['data']['user_info'].get('name', '')
  663. # user_id
  664. user_id = videoList[i]['data']['user_info'].get('user_id', '')
  665. # avatar_url
  666. avatar_url = videoList[i]['data']['user_info'].get('avatar_url', '')
  667. video_dict = {
  668. 'video_title': video_title,
  669. 'video_id': video_id,
  670. 'gid': gid,
  671. 'play_cnt': play_cnt,
  672. 'comment_cnt': comment_cnt,
  673. 'like_cnt': like_cnt,
  674. 'share_cnt': share_cnt,
  675. 'duration': duration,
  676. 'publish_time_stamp': publish_time_stamp,
  677. 'publish_time_str': publish_time_str,
  678. 'user_name': user_name,
  679. 'user_id': user_id,
  680. 'avatar_url': avatar_url,
  681. 'cover_url': cover_url,
  682. 'session': signature
  683. }
  684. for k, v in video_dict.items():
  685. Common.logger(log_type, crawler).info(f"{k}:{v}")
  686. if gid == 0 or video_id == '' or cover_url == '':
  687. Common.logger(log_type, crawler).info('无效视频\n')
  688. elif cls.download_rule(video_dict) is False:
  689. Common.logger(log_type, crawler).info('不满足抓取规则\n')
  690. elif any(str(word) if str(word) in video_title else False for word in cls.xigua_config(log_type, crawler, "filter", env)) is True:
  691. Common.logger(log_type, crawler).info('已中过滤词\n')
  692. elif cls.repeat_video(log_type, crawler, video_id, env) != 0:
  693. Common.logger(log_type, crawler).info('视频已下载\n')
  694. else:
  695. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  696. video_dict['video_url'] = video_url_dict["video_url"]
  697. video_dict["audio_url"] = video_url_dict["audio_url"]
  698. video_dict["video_width"] = video_url_dict["video_width"]
  699. video_dict["video_height"] = video_url_dict["video_height"]
  700. cls.download_publish(log_type, crawler, video_dict, oss_endpoint, env)
  @classmethod
  def download_publish(cls, log_type, crawler, video_dict, oss_endpoint, env):
      """Download one video, publish it, and record it in Feishu and MySQL.

      Steps, in order: download the video and audio streams, merge them,
      discard the item if the merged file is empty, download the cover,
      save the metadata, upload via ``Publish.upload_and_publish``, then
      write one row to the Feishu sheet and one row to ``crawler_video``.

      :param log_type: log category forwarded to ``Common.logger`` and the
          other helpers.
      :param crawler: crawler name; also the root of the local working
          directory ``./{crawler}/videos/``.
      :param video_dict: video metadata. This method reads the keys
          ``video_title``, ``video_id``, ``gid``, ``play_cnt``,
          ``comment_cnt``, ``like_cnt``, ``share_cnt``, ``duration``,
          ``video_width``, ``video_height``, ``publish_time_str``,
          ``user_name``, ``user_id``, ``avatar_url``, ``cover_url``,
          ``audio_url`` and ``video_url``.
      :param oss_endpoint: forwarded unchanged to
          ``Publish.upload_and_publish``.
      :param env: ``'dev'`` selects the test admin host for the CMS link;
          any other value selects the production admin host.
      :return: None.
      """
      video_title = video_dict['video_title']
      # The merged file is read from a folder named after the md5 hex digest
      # of the title (see the size check below), so every cleanup in this
      # method must target that same folder.
      md_title = md5(video_title.encode('utf8')).hexdigest()
      video_folder = f"./{crawler}/videos/{md_title}"

      # Download the video stream.
      Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=video_title,
                             url=video_dict['video_url'])
      # Download the audio stream.
      Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=video_title,
                             url=video_dict['audio_url'])
      # Merge audio and video.
      # NOTE(review): video_dir is built from the raw title while the size
      # check below reads from the md5-named folder; left unchanged because
      # Common.video_compose is not visible here -- confirm which path it expects.
      Common.video_compose(log_type=log_type, crawler=crawler,
                           video_dir=f"./{crawler}/videos/{video_title}")

      if os.path.getsize(f"{video_folder}/video.mp4") == 0:
          # An empty merged file is useless: delete the video folder and stop.
          shutil.rmtree(video_folder)
          Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
          return

      # Download the cover image.
      Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_title,
                             url=video_dict['cover_url'])
      # Save the video info to a txt file.
      Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

      # Upload the video.
      Common.logger(log_type, crawler).info("开始上传视频...")
      our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                crawler=crawler,
                                                strategy="推荐榜爬虫策略",
                                                our_uid="recommend",
                                                env=env,
                                                oss_endpoint=oss_endpoint)
      Common.logger(log_type, crawler).info("视频上传完成")

      if our_video_id is None:
          # No id came back: delete the video folder. This is the same
          # md5-named folder checked above (the original removed a path built
          # from the raw title instead). Removal is best-effort so a folder
          # that is already gone does not crash the crawler.
          shutil.rmtree(video_folder, ignore_errors=True)
          return

      # Build the CMS link only once a real id is known.
      if env == 'dev':
          our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
      else:
          our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

      # Write the video to the Feishu sheet: insert a row, then fill it.
      Feishu.insert_columns(log_type, 'xigua', "1iKGF1", "ROWS", 1, 2)
      upload_time = int(time.time())
      values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                 "推荐榜爬虫策略",
                 video_title,
                 str(video_dict['video_id']),
                 our_video_link,
                 video_dict['gid'],
                 video_dict['play_cnt'],
                 video_dict['comment_cnt'],
                 video_dict['like_cnt'],
                 video_dict['share_cnt'],
                 video_dict['duration'],
                 str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                 video_dict['publish_time_str'],
                 video_dict['user_name'],
                 video_dict['user_id'],
                 video_dict['avatar_url'],
                 video_dict['cover_url'],
                 video_dict['audio_url'],
                 video_dict['video_url']]]
      time.sleep(1)
      Feishu.update_values(log_type, 'xigua', "1iKGF1", "F2:Z2", values)
      Common.logger(log_type, crawler).info("视频已保存至云文档\n")

      # Stored as JSON in the crawler_rule column of the inserted row.
      rule_dict = {
          "play_cnt": {"min": 10000},
          "duration": {"min": 60, "max": 60*30},
          "publish_day": {"min": 30}
      }
      # Save the video info to the database.
      # NOTE(review): this statement is assembled by string interpolation
      # from scraped fields (title, user id, URLs). A quote or backslash in
      # any of them can break or alter the SQL. Switch to a parameterized
      # query if MysqlHelper supports one -- its interface is not visible here.
      insert_sql = f""" insert into crawler_video(video_id,
                      user_id,
                      out_user_id,
                      platform,
                      strategy,
                      out_video_id,
                      video_title,
                      cover_url,
                      video_url,
                      duration,
                      publish_time,
                      play_cnt,
                      crawler_rule,
                      width,
                      height)
                      values({our_video_id},
                      50322238,
                      "{video_dict['user_id']}",
                      "{cls.platform}",
                      "推荐榜爬虫策略",
                      "{video_dict['video_id']}",
                      "{video_dict['video_title']}",
                      "{video_dict['cover_url']}",
                      "{video_dict['video_url']}",
                      {int(video_dict['duration'])},
                      "{video_dict['publish_time_str']}",
                      {int(video_dict['play_cnt'])},
                      '{json.dumps(rule_dict)}',
                      {int(video_dict['video_width'])},
                      {int(video_dict['video_height'])}) """
      Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
      MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
      Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  if __name__ == "__main__":
      # Manual debugging entry points; uncomment the call you want to try.
      # XiguaRecommend.get_signature("recommend", "xigua", "dev")
      # XiguaRecommend.get_videolist("recommend", "xigua", "dev")
      # print(XiguaRecommend.get_video_url("recommend", "xigua", "7218171653242094139"))
      # print(XiguaRecommend.filter_words("recommend", "xigua"))
      # Print the result of the "title" config lookup for the dev environment.
      title_config = XiguaRecommend.xigua_config("recommend", "xigua", "title", "dev")
      print(title_config)