xigua_search.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import re
  9. import shutil
  10. import string
  11. import sys
  12. import time
  13. import requests
  14. import urllib3
  15. from urllib.parse import quote
  16. from requests.adapters import HTTPAdapter
  17. sys.path.append(os.getcwd())
  18. from common.db import MysqlHelper
  19. from common.users import Users
  20. from common.common import Common
  21. from common.feishu import Feishu
  22. from common.publish import Publish
  23. class Search:
  24. platform = "西瓜视频"
  25. tag = "西瓜视频爬虫,搜索爬虫策略"
  @classmethod
  def get_rule(cls, log_type, crawler):
      """Load the crawl thresholds from Feishu sheet "shxOl7".

      Polls ``Feishu.get_values_batch`` every 10 seconds until it returns a
      sheet, then reads column index 2 of rows 1-4 as integers.

      :param log_type: forwarded to ``Common.logger`` and ``Feishu``.
      :param crawler: forwarded to ``Common.logger`` and ``Feishu``.
      :return: dict with the keys ``play_cnt``, ``min_duration``,
          ``max_duration`` and ``publish_time``; ``None`` when any exception
          is raised (the exception is logged, not propagated).
      """
      # (rule name, sheet row index); the value always sits in column index 2.
      cell_rows = (
          ("play_cnt", 1),
          ("min_duration", 2),
          ("max_duration", 3),
          ("publish_time", 4),
      )
      try:
          sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          while sheet is None:
              Common.logger(log_type, crawler).warning("rule_sheet is None! 10秒后重新获取")
              time.sleep(10)
              sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          return {name: int(sheet[row][2]) for name, row in cell_rows}
      except Exception as err:
          Common.logger(log_type, crawler).error(f"get_rule:{err}\n")
  44. # 下载规则
  45. @classmethod
  46. def download_rule(cls, video_info_dict, rule_dict):
  47. if video_info_dict['play_cnt'] >= rule_dict['play_cnt']:
  48. if video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']:
  49. if video_info_dict['like_cnt'] >= rule_dict['like_cnt']:
  50. if video_info_dict['duration'] >= rule_dict['duration']:
  51. if video_info_dict['video_width'] >= rule_dict['video_width'] \
  52. or video_info_dict['video_height'] >= rule_dict['video_height']:
  53. return True
  54. else:
  55. return False
  56. else:
  57. return False
  58. else:
  59. return False
  60. else:
  61. return False
  62. else:
  63. return False
  # Filter-word lexicon
  @classmethod
  def filter_words(cls, log_type, crawler):
      """Return every non-empty cell of Feishu sheet 'KGB4Hc' as a flat list.

      Polls ``Feishu.get_values_batch`` until it returns a sheet, waiting
      10 seconds between attempts as the warning message announces.

      :param log_type: forwarded to ``Common.logger`` and ``Feishu``.
      :param crawler: forwarded to ``Common.logger`` and ``Feishu``.
      :return: list of the sheet's cell values that are not ``None``, in
          row-major order; ``None`` when any exception is raised (the
          exception is logged, not propagated).
      """
      try:
          while True:
              filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if filter_words_sheet is None:
                  Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                  # Back off before retrying; without this the loop spins
                  # and hammers the Feishu API while the sheet is unavailable.
                  time.sleep(10)
                  continue
              return [cell for row in filter_words_sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  # Get user info (dict format). Note: some user_id values are int, others are str
  @classmethod
  def get_user_list(cls, log_type, crawler, sheetid, env, machine):
      """Build the list of search keywords and their in-house user ids.

      Reads the Feishu sheet ``sheetid`` (skipping row 0) and, for every row
      whose uid cell (column index 6) is empty, creates a uid through
      ``Users.create_uid`` and writes the uid and its admin link back to
      cells G/H of that row.

      :param log_type: forwarded to ``Common.logger``, ``Feishu`` and ``Users``.
      :param crawler: forwarded to ``Common.logger``, ``Feishu`` and ``Users``.
      :param sheetid: id of the Feishu sheet to read and update.
      :param env: forwarded to ``Users.create_uid``.
      :param machine: unused here; kept for interface compatibility.
      :return: list of dicts with the keys ``out_uid``, ``search_word``,
          ``our_uid`` and ``our_user_link``; ``None`` when any exception is
          raised (the exception is logged, not propagated).
      """
      try:
          while True:
              user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
              if user_sheet is None:
                  Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
                  # Back off before retrying, as the message announces,
                  # instead of busy-looping against the Feishu API.
                  time.sleep(10)
                  continue
              our_user_list = []
              # Sheet rows are 1-based, so list index 1 is sheet row 2.
              for sheet_row, row in enumerate(user_sheet[1:], start=2):
                  our_uid = row[6]
                  search_word = row[4]
                  storage = row[5]
                  tag = row[8]
                  Common.logger(log_type, crawler).info(f"正在更新 {search_word} 关键词信息\n")
                  if our_uid is None:
                      # Profile used to create the in-house uid.
                      user_dict = {
                          'nickName': search_word,
                          'avatarUrl': '',
                          'tagName': f'{storage},{tag}',
                      }
                      our_uid = Users.create_uid(log_type, crawler, user_dict, env)
                      # Build the link only after the uid exists; building it
                      # earlier stored ".../user/None/post" in the sheet.
                      our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
                      Feishu.update_values(log_type, crawler, sheetid, f'G{sheet_row}:H{sheet_row}',
                                           [[our_uid, our_user_link]])
                      Common.logger(log_type, crawler).info('站内用户信息写入飞书成功!\n')
                  our_user_list.append({
                      'out_uid': '',
                      'search_word': search_word,
                      'our_uid': our_uid,
                      'our_user_link': f'https://admin.piaoquantv.com/ums/user/{our_uid}/post',
                  })
              return our_user_list
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
  121. @classmethod
  122. def random_signature(cls):
  123. src_digits = string.digits # string_数字
  124. src_uppercase = string.ascii_uppercase # string_大写字母
  125. src_lowercase = string.ascii_lowercase # string_小写字母
  126. digits_num = random.randint(1, 6)
  127. uppercase_num = random.randint(1, 26 - digits_num - 1)
  128. lowercase_num = 26 - (digits_num + uppercase_num)
  129. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  130. src_lowercase, lowercase_num)
  131. random.shuffle(password)
  132. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  133. new_password_start = new_password[0:18]
  134. new_password_end = new_password[-7:]
  135. if new_password[18] == '8':
  136. new_password = new_password_start + 'w' + new_password_end
  137. elif new_password[18] == '9':
  138. new_password = new_password_start + 'x' + new_password_end
  139. elif new_password[18] == '-':
  140. new_password = new_password_start + 'y' + new_password_end
  141. elif new_password[18] == '.':
  142. new_password = new_password_start + 'z' + new_password_end
  143. else:
  144. new_password = new_password_start + 'y' + new_password_end
  145. return new_password
  # Fetch video details
  @staticmethod
  def _decode_b64_url(encoded):
      """Decode a base64 string whose trailing ``=`` padding may be missing."""
      # Base64 text must be a multiple of 4 characters long; restore whatever
      # padding was stripped before decoding.
      encoded += '=' * (-len(encoded) % 4)
      return base64.b64decode(encoded).decode('utf8')

  @classmethod
  def _stream_from_resource(cls, resource):
      """Extract URLs and dimensions from one ``videoResource`` entry.

      Entries of ``video_list`` are tried in the order video_4, video_3,
      video_2, video_1 (presumably best quality first -- TODO confirm); the
      same entry supplies both the video and the audio URL. If none is
      present, the last items of ``dynamic_video_list`` and
      ``dynamic_audio_list`` are used. When nothing usable is found the
      URLs are empty strings and the dimensions are 0.
      """
      empty = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}
      if not resource:
          return empty
      video_list = resource.get('video_list') or {}
      for quality in ('video_4', 'video_3', 'video_2', 'video_1'):
          if quality in video_list:
              video_entry = audio_entry = video_list[quality]
              break
      else:
          dynamic = resource.get('dynamic_video') or {}
          video_streams = dynamic.get('dynamic_video_list')
          audio_streams = dynamic.get('dynamic_audio_list')
          if not video_streams or not audio_streams:
              return empty
          video_entry, audio_entry = video_streams[-1], audio_streams[-1]
      return {
          "video_url": cls._decode_b64_url(video_entry['backup_url_1']),
          "audio_url": cls._decode_b64_url(audio_entry['backup_url_1']),
          "video_width": video_entry['vwidth'],
          "video_height": video_entry['vheight'],
      }

  @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Resolve the playable video/audio URLs and dimensions for ``gid``.

      Calls the ixigua ``mixVideo/information`` API through the tunnel
      proxy (5 s timeout, up to 3 retries) and decodes the base64-encoded
      stream URLs. Of the resources ``dash_120fps``, ``dash`` and
      ``normal``, only the first one present is inspected.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param gid: video id sent as the ``mixId`` query parameter.
      :return: dict with the keys ``video_url``, ``audio_url``,
          ``video_width`` and ``video_height`` (empty strings / 0 when no
          stream is found); ``None`` when the response carries no data or
          any exception is raised (both cases are logged).
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          # NOTE(review): the token, signature and cookie values below are
          # hard-coded; presumably they can expire -- confirm how they are refreshed.
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          # verify=False below would otherwise emit an InsecureRequestWarning.
          urllib3.disable_warnings()
          # The context manager releases the session's connection pool.
          with requests.session() as s:
              # max_retries=3: retry up to 3 times
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
              # Parse the body once instead of on every access.
              payload = response.json()
          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
              return None
          video_info = payload['data']['gidInformation']['packerData']['video']
          resources = video_info.get('videoResource') or {}
          # Only the first resource kind present is inspected; there is no
          # fall-through to a later kind when it yields no stream.
          resource = next(
              (resources[kind] for kind in ('dash_120fps', 'dash', 'normal') if kind in resources),
              None,
          )
          return cls._stream_from_resource(resource)
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  @classmethod
  def get_video_info(cls, item_id):
      """Fetch the metadata of one video from the article detail API.

      :param item_id: id substituted twice into the request URL path.
      :return: dict with the keys ``video_title``, ``video_id``, ``gid``,
          ``play_cnt``, ``comment_cnt``, ``like_cnt``, ``share_cnt``,
          ``duration``, ``publish_time_stamp``, ``publish_time_str``,
          ``user_name``, ``user_id``, ``avatar_url`` and ``cover_url``.
      :raises Exception: request, JSON and missing-key errors propagate to
          the caller unchanged.
      """
      request_headers = {
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
      }
      detail_url = (
          f"http://a6.pstatp.com/article/full/11/1/{item_id}/{item_id}/1/0/?iid=3636030325&device_id=5787057242"
          "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform"
          "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705"
          "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a"
          "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320"
      )
      reply = requests.get(url=detail_url, headers=request_headers, proxies=Common.tunnel_proxies())
      payload = json.loads(reply.text)['data']
      counters = payload['h5_extra']['itemCell']['itemCounter']
      author = payload['user_info']
      detail = payload['video_detail_info']

      video_dict = {}
      video_dict['video_title'] = payload['title']
      video_dict['video_id'] = detail['video_id']
      video_dict['gid'] = payload['group_id']
      video_dict['play_cnt'] = counters['videoWatchCount']
      video_dict['comment_cnt'] = counters['commentCount']
      video_dict['like_cnt'] = counters['diggCount']
      video_dict['share_cnt'] = counters['shareCount']
      video_dict['duration'] = payload['video_duration']
      video_dict['publish_time_stamp'] = payload['publish_time']
      # Rendered in the local time zone of the machine running the crawler.
      video_dict['publish_time_str'] = time.strftime(
          "%Y-%m-%d %H:%M:%S", time.localtime(payload['publish_time']))
      video_dict['user_name'] = author['name']
      video_dict['user_id'] = author['user_id']
      video_dict['avatar_url'] = author['avatar_url']
      # NOTE(review): in a Python literal '\u0026' is the '&' character
      # itself, so this replace leaves the URL unchanged.
      video_dict['cover_url'] = payload['large_image']['url'].replace('\u0026', '&')
      return video_dict
  589. @classmethod
  590. def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
  591. old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
  592. if video_dict['publish_time_stamp'] <= old_time:
  593. return False
  594. elif video_dict['play_cnt'] <= rule_dict['play_cnt']:
  595. return False
  596. elif video_dict['duration'] < rule_dict['min_duration'] or video_dict['duration'] > rule_dict['max_duration']:
  597. return False
  598. else:
  599. return True
  @classmethod
  def _attach_video_urls(cls, log_type, crawler, video_dict, signature):
      """Copy the playback details for ``video_dict['gid']`` into ``video_dict``.

      Calls ``get_video_url`` and stores its ``video_width``, ``video_height``,
      ``audio_url`` and ``video_url`` values on ``video_dict``, together with
      ``signature`` under the ``session`` key.

      :param log_type: forwarded to ``get_video_url``.
      :param crawler: forwarded to ``get_video_url``.
      :param video_dict: dict holding at least ``gid``; mutated in place.
      :param signature: value stored as ``video_dict['session']``.
      :return: the same ``video_dict`` object.
      :raises Exception: any error from ``get_video_url`` or from reading its
          result is propagated to the caller.
      """
      video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
      video_dict['video_width'] = video_url_dict["video_width"]
      video_dict['video_height'] = video_url_dict["video_height"]
      video_dict['audio_url'] = video_url_dict["audio_url"]
      video_dict['video_url'] = video_url_dict["video_url"]
      video_dict['session'] = signature
      return video_dict

  @classmethod
  def get_videolist(cls, log_type, crawler, strategy, our_uid, search_word, oss_endpoint, env, machine):
      """Search for ``search_word`` and hand every acceptable video to ``download_publish``.

      Result pages are requested with an offset that grows by 10 per page.
      The method returns when a page is empty, when the search request fails
      (the failure is logged), or once 30 videos have been passed to
      ``download_publish``.

      For a ``video`` result the video itself is considered; for a ``pseries``
      (collection) result the first episode that passes ``is_ruled`` is used.
      Results of any other type, results missing the expected fields, and
      results for which no fully populated video could be built are skipped.

      :param log_type: passed to ``Common.logger`` and to the helper methods.
      :param crawler: passed to ``Common.logger`` and to the helper methods.
      :param strategy: forwarded to ``download_publish``.
      :param our_uid: forwarded to ``download_publish``.
      :param search_word: keyword to search for; URL-quoted before use.
      :param oss_endpoint: forwarded to ``download_publish``.
      :param env: forwarded to ``download_publish``.
      :param machine: forwarded to ``download_publish``.
      :return: ``None``.
      """
      # NOTE(review): nothing here consults ``repeat_video`` before downloading;
      # confirm whether already-stored videos are meant to be skipped.
      user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
      total_count = 0
      offset = 0
      while True:
          signature = cls.random_signature()
          url = "https://www.ixigua.com/api/searchv2/complex/{}/{}?_signature={}".format(
              quote(search_word), offset, signature)
          headers = {
              'accept': 'application/json, text/plain, */*',
              'referer': 'https://www.ixigua.com/search/{}/?logTag=594535e3690f17a88cdb&tab_name=search'.format(
                  quote(search_word)),
              'user-agent': user_agent,
          }
          try:
              res = requests.request("GET", url, headers=headers, proxies=Common.tunnel_proxies())
              search_list = res.json()['data']['data']
          except Exception as e:
              # Record why the page is treated as empty instead of failing silently.
              Common.logger(log_type, crawler).error(f'搜索:{search_word},没有获取到搜索结果,原因:{e}')
              search_list = []
          if not search_list:
              return

          # The rule does not depend on the individual result, so fetch it once per page.
          rule_dict = cls.get_rule(log_type, crawler)

          for video_info in search_list:
              try:
                  v_type = video_info['type']
                  item_id = video_info['data']['group_id']
              except (KeyError, TypeError):
                  # Entry without the fields needed below: nothing to download.
                  continue

              # Stays None unless a video passes the rule AND its URLs were resolved.
              video_dict = None

              if v_type == 'video':
                  try:
                      candidate = cls.get_video_info(item_id)
                      # Check the rule first so rejected videos cost no extra request.
                      if cls.is_ruled(log_type, crawler, candidate, rule_dict):
                          video_dict = cls._attach_video_urls(log_type, crawler, candidate, signature)
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                      continue
              elif v_type == 'pseries':
                  p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
                      item_id)
                  p_headers = {
                      'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
                          item_id),
                      'user-agent': user_agent,
                  }
                  try:
                      p_res = requests.request("GET", p_url, headers=p_headers, proxies=Common.tunnel_proxies()).json()
                      series_videos = p_res['data']
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
                      continue
                  # Take the first episode that passes the rule and can be fully resolved.
                  for video in series_videos or []:
                      try:
                          item_id = video['item_id']
                          candidate = cls.get_video_info(item_id)
                          if not cls.is_ruled(log_type, crawler, candidate, rule_dict):
                              continue
                          video_dict = cls._attach_video_urls(log_type, crawler, candidate, signature)
                          break
                      except Exception as e:
                          Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                          continue

              if video_dict is None:
                  # Unsupported result type, rule not met, or no usable episode.
                  continue

              for k, v in video_dict.items():
                  Common.logger(log_type, crawler).info(f"{k}:{v}")
              cls.download_publish(
                  search_word=search_word,
                  log_type=log_type,
                  crawler=crawler,
                  video_dict=video_dict,
                  rule_dict=rule_dict,
                  strategy=strategy,
                  our_uid=our_uid,
                  oss_endpoint=oss_endpoint,
                  env=env,
                  machine=machine
              )
              total_count += 1
              if total_count >= 30:
                  return
          offset += 10
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count the ``crawler_video`` rows already stored for ``video_id``.

      The query filters on ``platform = cls.platform`` and
      ``out_video_id = video_id``.

      :param log_type: forwarded to ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``MysqlHelper.get_values``.
      :param video_id: value matched against the ``out_video_id`` column.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :param machine: forwarded to ``MysqlHelper.get_values``.
      :return: ``len()`` of whatever ``MysqlHelper.get_values`` returns.
      """
      # NOTE(review): video_id is interpolated straight into the SQL text; a value
      # containing a double quote would break the statement. Confirm whether
      # MysqlHelper offers parameter binding.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      return len(MysqlHelper.get_values(log_type, crawler, query, env, machine))
  # Download / upload
  @classmethod
  def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                       env, machine):
      """Download one video, publish it, then record it in Feishu and MySQL.

      Steps, in order: download the video and audio streams, compose them,
      inspect the composed ``video.mp4`` through ``Common.ffmpeg``, download the
      cover, save the metadata, upload through ``Publish.upload_and_publish``,
      insert a row into the Feishu sheet and insert a row into
      ``crawler_video``.

      The working folder ``./{crawler}/videos/{video_title}`` is removed when
      the composed file is reported as missing or empty, when the upload
      returns no id, and when any step raises.

      Every exception is logged and swallowed, so callers never see one.

      :param log_type: passed to ``Common``, ``Publish``, ``Feishu`` and ``MysqlHelper``.
      :param crawler: passed to the same helpers; also part of the working folder path.
      :param search_word: written to the first column of the Feishu row.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param video_dict: video metadata; reads ``video_title``, ``video_url``,
          ``audio_url``, ``cover_url``, ``video_id``, ``gid``, ``play_cnt``,
          ``comment_cnt``, ``like_cnt``, ``share_cnt``, ``duration``,
          ``video_width``, ``video_height``, ``publish_time_str``,
          ``user_name``, ``user_id`` and ``avatar_url``.
      :param rule_dict: JSON-serialised into the ``crawler_rule`` column.
      :param our_uid: forwarded to the upload and stored in the ``user_id`` column.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param env: ``'dev'`` selects the test admin link; also forwarded to the
          upload and to ``MysqlHelper.update_values``.
      :param machine: forwarded to ``MysqlHelper.update_values``.
      :return: ``None``.
      """
      video_title = None
      video_dir = None
      try:
          video_title = video_dict['video_title']
          video_dir = f"./{crawler}/videos/{video_title}"

          # Download the video stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                 title=video_title, url=video_dict['video_url'])
          # Download the audio stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
                                 title=video_title, url=video_dict['audio_url'])
          # Compose audio and video
          Common.video_compose(log_type=log_type, crawler=crawler, video_dir=video_dir)
          ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}/video.mp4")
          if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
              Common.logger(log_type, crawler).warning("下载的视频无效,已删除\n")
              # Remove the video folder
              shutil.rmtree(video_dir)
              return

          # Download the cover
          Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                                 title=video_title, url=video_dict['cover_url'])
          # Save the video info to a txt file
          Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

          # Upload the video
          Common.logger(log_type, crawler).info("开始上传视频...")
          our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                    crawler=crawler,
                                                    strategy=strategy,
                                                    our_uid=our_uid,
                                                    env=env,
                                                    oss_endpoint=oss_endpoint)
          # Check the result before reporting success or building a link from it.
          if our_video_id is None:
              # Remove the video folder
              shutil.rmtree(video_dir)
              return
          Common.logger(log_type, crawler).info("视频上传完成")
          if env == 'dev':
              our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
          else:
              our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

          # Write the video to the Feishu sheet
          Feishu.insert_columns(log_type, 'xigua', "BUNvGC", "ROWS", 1, 2)
          upload_time = int(time.time())
          values = [[
              search_word,
              time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
              "关键词搜索",
              video_title,
              str(video_dict['video_id']),
              our_video_link,
              video_dict['gid'],
              video_dict['play_cnt'],
              video_dict['comment_cnt'],
              video_dict['like_cnt'],
              video_dict['share_cnt'],
              video_dict['duration'],
              str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
              video_dict['publish_time_str'],
              video_dict['user_name'],
              video_dict['user_id'],
              video_dict['avatar_url'],
              video_dict['cover_url'],
              video_dict['video_url'],
              video_dict['audio_url']]]
          time.sleep(1)
          Feishu.update_values(log_type, 'xigua', "BUNvGC", "E2:Z2", values)
          Common.logger(log_type, crawler).info("视频已保存至云文档\n")

          # Save the video info to the database
          # NOTE(review): values are interpolated into the SQL text; a title or URL
          # containing a double quote breaks the statement after the video has
          # already been published. Confirm whether MysqlHelper supports
          # parameter binding.
          # NOTE(review): the strategy column is hard-coded below and the
          # ``strategy`` argument is only used for the upload; confirm this is
          # intended.
          insert_sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          {our_uid},
                          "{video_dict['user_id']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(video_dict['duration'])},
                          "{video_dict['publish_time_str']}",
                          {int(video_dict['play_cnt'])},
                          '{json.dumps(rule_dict)}',
                          {int(video_dict['video_width'])},
                          {int(video_dict['video_height'])}) """
          Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
          MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
          # Mirror the explicit failure paths above, which also delete the working
          # folder. Skipped when the title is empty so the shared videos directory
          # itself is never the target.
          if video_dir is not None and video_title:
              shutil.rmtree(video_dir, ignore_errors=True)
  @classmethod
  def get_search_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Run ``get_videolist`` for every entry returned by ``get_user_list``.

      Entries are read from sheet ``"SSPNPW"``; each one must provide
      ``search_word`` and ``our_uid``.  A failure while loading the list is
      logged and ends the run.  A failure while processing one entry is logged
      and the remaining entries are still processed.

      :param log_type: passed to ``Common.logger`` and the helper methods.
      :param crawler: passed to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``get_videolist``.
      :param oss_endpoint: forwarded to ``get_videolist``.
      :param env: forwarded to ``get_user_list`` and ``get_videolist``.
      :param machine: forwarded to ``get_user_list`` and ``get_videolist``.
      :return: ``None``.
      """
      try:
          # list() also surfaces a non-iterable result (e.g. None) here, where it is logged.
          user_list = list(cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="SSPNPW", env=env,
                                             machine=machine))
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
          return

      for user in user_list:
          # Isolate each keyword so one failure does not skip all the others.
          try:
              search_word = user["search_word"]
              our_uid = user["our_uid"]
              Common.logger(log_type, crawler).info(f"开始抓取 {search_word} 用户主页视频\n")
              cls.get_videolist(log_type=log_type,
                                crawler=crawler,
                                strategy=strategy,
                                our_uid=our_uid,
                                search_word=search_word,
                                oss_endpoint=oss_endpoint,
                                env=env,
                                machine=machine)
          except Exception as e:
              Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
  if __name__ == '__main__':
      # Script entry point. Every call below is commented out, so the guard only
      # executes ``pass``. The snippets are kept as manual debugging examples.
      # NOTE(review): most of them reference ``Follow``, which is not visible in
      # this part of the file -- confirm it exists in this module before
      # re-enabling them.
      # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
      # Search.get_search_videos('search', 'xigua', 'xigua_search', 'out', 'dev', 'local')
      # Follow.get_videolist(log_type="follow",
      #                      crawler="xigua",
      #                      strategy="定向爬虫策略",
      #                      our_uid="6267141",
      #                      out_uid="95420624045",
      #                      oss_endpoint="out",
      #                      env="dev",
      #                      machine="local")
      # print(Follow.random_signature())
      # rule = Follow.get_rule("follow", "xigua")
      # print(type(rule))
      # print(type(json.dumps(rule)))
      # print(json.dumps(rule))
      pass