xigua_search.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import re
  9. import shutil
  10. import string
  11. import sys
  12. import time
  13. import requests
  14. import urllib3
  15. from urllib.parse import quote
  16. from requests.adapters import HTTPAdapter
  17. sys.path.append(os.getcwd())
  18. from common.db import MysqlHelper
  19. from common.users import Users
  20. from common.common import Common
  21. from common.feishu import Feishu
  22. from common.publish import Publish
  23. from common.userAgent import get_random_user_agent, random_user_agent
  24. class Search:
  25. platform = "西瓜视频"
  26. tag = "西瓜视频爬虫,搜索爬虫策略"
  27. @classmethod
  def get_rule(cls, log_type, crawler):
      """Fetch the crawl thresholds from Feishu sheet ``shxOl7``.

      Calls ``Feishu.get_values_batch`` and, while it returns ``None``, logs a
      warning, sleeps 10 seconds and asks again. Once a sheet is available,
      column index 2 of rows 1-4 is converted to ``int``.

      :param log_type: forwarded to ``Common.logger`` and ``Feishu``.
      :param crawler: forwarded to ``Common.logger`` and ``Feishu``.
      :return: dict with the keys ``play_cnt``, ``min_duration``,
          ``max_duration`` and ``publish_time``. Any exception is logged and
          the method then returns ``None`` implicitly.
      """
      try:
          sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          while sheet is None:
              Common.logger(log_type, crawler).warning("rule_sheet is None! 10秒后重新获取")
              time.sleep(10)
              sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          # Rule N lives in sheet row N (row 0 is skipped), always in column 2.
          rule_names = ("play_cnt", "min_duration", "max_duration", "publish_time")
          return {name: int(sheet[row][2]) for row, name in enumerate(rule_names, start=1)}
      except Exception as err:
          Common.logger(log_type, crawler).error(f"get_rule:{err}\n")
  45. # 下载规则
  46. @classmethod
  def download_rule(cls, video_info_dict, rule_dict):
      """Return True when a video meets every download threshold.

      The checks are evaluated in order and stop at the first failure:
      ``play_cnt``, ``comment_cnt``, ``like_cnt`` and ``duration`` must each be
      greater than or equal to the value in ``rule_dict``, and at least one of
      ``video_width`` / ``video_height`` must reach its threshold.

      NOTE(review): the keys read here (``comment_cnt``, ``like_cnt``,
      ``duration``, ``video_width``, ``video_height``) are not the keys that
      ``get_rule`` produces, so the caller must supply a matching rule dict.

      :param video_info_dict: mapping with the video's counters and dimensions.
      :param rule_dict: mapping with the minimum value for each of those keys.
      :return: ``True`` if all thresholds are met, otherwise ``False``.
      :raises KeyError: if a key that gets evaluated is missing.
      """
      return bool(
          video_info_dict['play_cnt'] >= rule_dict['play_cnt']
          and video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']
          and video_info_dict['like_cnt'] >= rule_dict['like_cnt']
          and video_info_dict['duration'] >= rule_dict['duration']
          and (video_info_dict['video_width'] >= rule_dict['video_width']
               or video_info_dict['video_height'] >= rule_dict['video_height'])
      )
  65. # 过滤词库
  66. @classmethod
  def filter_words(cls, log_type, crawler):
      """Load the filter-word list from Feishu sheet ``KGB4Hc``.

      While ``Feishu.get_values_batch`` returns ``None`` a warning is logged
      and the call is retried after 10 seconds. The pause is what the log
      message announces; without it a missing sheet would be re-requested in
      a tight loop.

      :param log_type: forwarded to ``Common.logger`` and ``Feishu``.
      :param crawler: forwarded to ``Common.logger`` and ``Feishu``.
      :return: flat list of every non-None cell of the sheet, row by row.
          Any exception is logged and the method then returns ``None``
          implicitly.
      """
      try:
          while True:
              filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if filter_words_sheet is None:
                  Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                  time.sleep(10)
                  continue
              return [cell for row in filter_words_sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  84. # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
  85. @classmethod
  def get_user_list(cls, log_type, crawler, sheetid, env, machine):
      """Build the list of search keywords and their in-site user ids.

      Reads the Feishu sheet ``sheetid`` (retrying every 10 seconds while it
      comes back as ``None``) and walks every row after the first one. Per row,
      column index 4 is the search word, index 6 the in-site uid and indexes
      8-10 three tags. When the uid cell is empty a new uid is created through
      ``Users.create_uid`` and written back, with its admin link, to columns
      G:H of that sheet row.

      :param log_type: forwarded to ``Common.logger``, ``Feishu`` and ``Users``.
      :param crawler: forwarded to ``Common.logger``, ``Feishu`` and ``Users``.
      :param sheetid: id of the Feishu sheet holding the keyword rows.
      :param env: ``'prod'`` selects the production admin host; any other
          value selects the test admin host.
      :param machine: unused by this method; kept for interface compatibility.
      :return: list of dicts with the keys ``out_uid``, ``search_word``,
          ``our_uid`` and ``our_user_link``. Any exception is logged and the
          method then returns ``None`` implicitly.
      """
      try:
          while True:
              user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
              if user_sheet is None:
                  Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
                  # Actually wait before retrying, as the log message promises.
                  time.sleep(10)
                  continue

              admin_host = 'admin.piaoquantv.com' if env == 'prod' else 'testadmin.piaoquantv.com'
              our_user_list = []
              # row_no is the 1-based sheet row: list index 1 is sheet row 2.
              for row_no, row in enumerate(user_sheet[1:], start=2):
                  our_uid = row[6]
                  search_word = row[4]
                  tag1 = row[8]
                  tag2 = row[9]
                  tag3 = row[10]
                  Common.logger(log_type, crawler).info(f"正在更新 {search_word} 关键词信息\n")
                  if our_uid is None:
                      default_user = Users.get_default_user()
                      # Profile used to create the in-site uid.
                      user_dict = {
                          'nickName': default_user['nickName'],
                          'avatarUrl': default_user['avatarUrl'],
                          'tagName': f'{tag1},{tag2},{tag3}',
                      }
                      our_uid = Users.create_uid(log_type, crawler, user_dict, env)
                      # Log after creation so the message carries the new uid, not None.
                      Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
                      Feishu.update_values(log_type, crawler, sheetid, f'G{row_no}:H{row_no}',
                                           [[our_uid, f'https://{admin_host}/ums/user/{our_uid}/post']])
                      Common.logger(log_type, crawler).info('站内用户信息写入飞书成功!\n')
                  our_user_list.append({
                      'out_uid': '',
                      'search_word': search_word,
                      'our_uid': our_uid,
                      # Same env-dependent host as the link written to Feishu above.
                      'our_user_link': f'https://{admin_host}/ums/user/{our_uid}/post',
                  })
              return our_user_list
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
  129. @classmethod
  def random_signature(cls):
      """Generate a random 26-character signature string.

      A pool of 26 characters is drawn without repetition within each group
      (1-6 digits, at least one uppercase and at least one lowercase letter)
      and shuffled. Pool characters 10..21 are framed by the fixed prefix
      ``'AAAAAAAAAA'`` and suffix ``'AAAB'``. The character at index 18 is then
      replaced by a marker: ``'w'`` for ``'8'``, ``'x'`` for ``'9'`` and ``'y'``
      for anything else.

      The pool only ever contains digits and ASCII letters, so the former
      special cases for ``'-'`` and ``'.'`` could never match and were
      removed; the output is unchanged.

      :return: 26-character ``str``.
      """
      # Keep this exact order of RNG calls so seeded runs stay reproducible.
      digits_num = random.randint(1, 6)
      uppercase_num = random.randint(1, 26 - digits_num - 1)
      lowercase_num = 26 - (digits_num + uppercase_num)
      pool = (random.sample(string.digits, digits_num)
              + random.sample(string.ascii_uppercase, uppercase_num)
              + random.sample(string.ascii_lowercase, lowercase_num))
      random.shuffle(pool)

      signature = 'AAAAAAAAAA' + ''.join(pool)[10:-4] + 'AAAB'
      marker = {'8': 'w', '9': 'x'}.get(signature[18], 'y')
      return signature[:18] + marker + signature[-7:]
  154. # 获取视频详情
  155. @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Resolve the playable video/audio URLs and dimensions for one video.

      Requests ``https://www.ixigua.com/api/mixVideo/information`` for ``gid``
      through ``Common.tunnel_proxies()`` (5 second timeout, up to 3 retries
      per scheme) and extracts the stream information from
      ``data.gidInformation.packerData.video``.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param gid: sent as the ``mixId`` query parameter.
      :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
          ``video_height``. The URLs are ``''`` and the dimensions ``0`` when
          no usable stream is listed. Returns ``None`` when the response has
          no ``data`` (a warning is logged) or when any exception is raised
          (it is logged).
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": get_random_user_agent('pc'),
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          urllib3.disable_warnings()
          # The context manager closes the session once the request is done.
          with requests.session() as s:
              # max_retries=3: retry each request up to 3 times.
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
              response.close()

          # Parse the body once instead of on every access.
          payload = response.json()
          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
              return None

          video_info = payload['data']['gidInformation']['packerData']['video']
          return cls._pick_streams(video_info)
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')

  @staticmethod
  def _decode_b64_url(encoded):
      """Decode a base64 URL string whose trailing '=' padding may be missing."""
      # Base64 text must be a multiple of 4 characters long; restore what was stripped.
      padded = encoded + '=' * (-len(encoded) % 4)
      return base64.b64decode(padded).decode('utf8')

  @classmethod
  def _pick_streams(cls, video_info):
      """Choose the stream entry to use from ``video_info`` and decode its URLs."""
      result = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}
      if 'videoResource' not in video_info:
          return result

      resources = video_info['videoResource']
      # Only the first family present is inspected; there is no fall-through
      # to the next family when it holds no usable stream.
      family = next((name for name in ('dash_120fps', 'dash', 'normal') if name in resources), None)
      if family is None:
          return result
      resource = resources[family]

      video_item = audio_item = None
      if "video_list" in resource:
          # Highest-numbered entry wins. Video and audio URL both come from
          # the same video_list entry.
          for quality in ('video_4', 'video_3', 'video_2', 'video_1'):
              if quality in resource['video_list']:
                  video_item = audio_item = resource['video_list'][quality]
                  break

      if video_item is None \
              and 'dynamic_video' in resource \
              and 'dynamic_video_list' in resource['dynamic_video'] \
              and 'dynamic_audio_list' in resource['dynamic_video'] \
              and len(resource['dynamic_video']['dynamic_video_list']) != 0 \
              and len(resource['dynamic_video']['dynamic_audio_list']) != 0:
          # Separate video and audio tracks: take the last entry of each list.
          video_item = resource['dynamic_video']['dynamic_video_list'][-1]
          audio_item = resource['dynamic_video']['dynamic_audio_list'][-1]

      if video_item is None:
          return result

      result["video_url"] = cls._decode_b64_url(video_item['backup_url_1'])
      result["audio_url"] = cls._decode_b64_url(audio_item['backup_url_1'])
      result["video_width"] = video_item['vwidth']
      result["video_height"] = video_item['vheight']
      return result
  563. @classmethod
  def get_video_info(cls, log_type, crawler, item_id):
      """Fetch title, counters, author and cover details for one video.

      Requests the ``a6.pstatp.com/article/full`` endpoint for ``item_id``
      through ``Common.tunnel_proxies()`` and flattens the ``data`` object of
      the JSON response. The request has a 5 second timeout so that a stalled
      proxy or server cannot block the crawler indefinitely; a timeout is
      handled like any other failure.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param item_id: substituted twice into the request path.
      :return: dict with ``video_title``, ``video_id``, ``gid``, ``play_cnt``,
          ``comment_cnt``, ``like_cnt``, ``share_cnt``, ``duration``,
          ``publish_time_stamp``, ``publish_time_str`` (local time,
          ``%Y-%m-%d %H:%M:%S``), ``user_name``, ``user_id``, ``avatar_url``
          and ``cover_url``; an empty dict if anything fails (logged).
      """
      try:
          d_url = (
              "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242"
              "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform"
              "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705"
              "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a"
              "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320"
          ).format(video_id=item_id)
          res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies(),
                             timeout=5)
          data = json.loads(res.text)['data']
          item_counter = data['h5_extra']['itemCell']['itemCounter']
          user_info = data['user_info']
          detail_info = data['video_detail_info']
          video_dict = {'video_title': data['title'],
                        'video_id': detail_info['video_id'],
                        'gid': data['group_id'],
                        'play_cnt': item_counter['videoWatchCount'],
                        'comment_cnt': item_counter['commentCount'],
                        'like_cnt': item_counter['diggCount'],
                        'share_cnt': item_counter['shareCount'],
                        'duration': data['video_duration'],
                        'publish_time_stamp': data['publish_time'],
                        'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
                                                          time.localtime(data['publish_time'])),
                        'user_name': user_info['name'],
                        'user_id': user_info['user_id'],
                        'avatar_url': user_info['avatar_url'],
                        # NOTE(review): '\u0026' in a Python literal is the '&'
                        # character itself, so this replace leaves the URL
                        # unchanged; kept as in the original.
                        'cover_url': data['large_image']['url'].replace('\u0026', '&'),
                        }
          return video_dict
      except Exception as e:
          Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
          return {}
  597. @classmethod
  def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
      """Tell whether a video satisfies the crawl rules.

      A video is rejected when any of the following holds, checked in this
      order: it was published at or before ``now - publish_time`` days, its
      play count is not strictly above ``rule_dict['play_cnt']``, or its
      duration lies outside ``[min_duration, max_duration]``.

      :param log_type: not used by this method.
      :param crawler: not used by this method.
      :param video_dict: mapping with ``publish_time_stamp``, ``play_cnt`` and
          ``duration``.
      :param rule_dict: mapping with ``publish_time`` (days), ``play_cnt``,
          ``min_duration`` and ``max_duration``.
      :return: ``True`` if the video passes every rule, otherwise ``False``.
      """
      # Oldest acceptable publish time, in epoch seconds.
      cutoff = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
      rejected = (
          video_dict['publish_time_stamp'] <= cutoff
          or video_dict['play_cnt'] <= rule_dict['play_cnt']
          or video_dict['duration'] < rule_dict['min_duration']
          or video_dict['duration'] > rule_dict['max_duration']
      )
      return not rejected
  @classmethod
  def get_videolist(cls, log_type, crawler, strategy, our_uid, search_word, oss_endpoint, env, machine):
      """Search for ``search_word`` page by page and download/publish the hits.

      Each page of the ``searchv2/complex`` endpoint is walked in order:

      * a ``video`` hit is pre-filtered on the publish time, watch count and
        duration carried by the search result, then its details and stream
        URLs are fetched;
      * a ``pseries`` (collection) hit is expanded into its videos and the
        first one that passes ``is_ruled`` and resolves its stream URLs is
        used;
      * any other hit type, and any hit for which no usable video could be
        built, is skipped.

      A usable video that is not already recorded (``repeat_video``) is handed
      to ``download_publish``.

      The method returns when a page is empty or cannot be fetched, or once 30
      videos have gone through ``download_publish`` without raising.

      :param log_type: passed to ``Common.logger`` and the helper methods.
      :param crawler: passed to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``download_publish``.
      :param our_uid: forwarded to ``download_publish``.
      :param search_word: keyword to search for (URL-quoted before use).
      :param oss_endpoint: forwarded to ``download_publish``.
      :param env: forwarded to ``repeat_video`` and ``download_publish``.
      :param machine: forwarded to ``repeat_video`` and ``download_publish``.
      :return: ``None``.
      """
      max_videos = 30        # stop after this many processed videos
      page_size = 10         # offset step between result pages
      request_timeout = 30   # seconds; keeps a stalled proxy from hanging the crawl
      supported_types = ('video', 'pseries')

      total_count = 0
      offset = 0
      while True:
          # The signature is not part of the search URL; it is only stored
          # with each video as its 'session' value.
          signature = cls.random_signature()
          url = "https://www.ixigua.com/api/searchv2/complex/{}/{}".format(quote(search_word), offset)
          headers = {
              'referer': 'https://www.ixigua.com/search/{}/?logTag=594535e3690f17a88cdb&tab_name=search'.format(
                  quote(search_word)),
              'cookie': 'ttwid=1|x_4RDmVTqp6BQ5Xy5AnuCZCQdDyDxv-fnMVWzj19VU0|1678693556|883092b75667cbcc48cbcc4b3b44d060aa205ef26c7640dc672d0cc50ddf0be9',
              'user-agent': get_random_user_agent('pc'),
          }
          try:
              res = requests.request("GET", url, headers=headers, proxies=Common.tunnel_proxies(),
                                     timeout=request_timeout)
              search_list = res.json()['data']['data']
          except Exception:
              Common.logger(log_type, crawler).error(f'关键词:{search_word},没有获取到视频列表:offset{offset}')
              search_list = []
          if not search_list:
              return

          for video_info in search_list:
              v_type = video_info.get('type')
              if v_type not in supported_types:
                  # Other hit types carry no video to download. Previously they
                  # fell through with an empty video_dict and the resulting
                  # KeyError aborted the whole crawl.
                  continue
              item_id = video_info['data']['group_id']
              rule_dict = cls.get_rule(log_type, crawler)
              video_dict = {}

              if v_type == 'video':
                  try:
                      hit = video_info['data']
                      # Cheap pre-filter on the search result itself, before
                      # spending requests on the detail and URL endpoints.
                      old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
                      if hit['publish_time'] <= old_time:
                          continue
                      if hit['video_watch_count'] <= rule_dict['play_cnt']:
                          continue
                      if hit['video_time'] < rule_dict['min_duration'] or hit['video_time'] > rule_dict['max_duration']:
                          continue
                      video_dict = cls.get_video_info(log_type, crawler, item_id)
                      if not video_dict:
                          continue
                      cls._attach_play_urls(log_type, crawler, video_dict, signature)
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                      continue
              else:  # 'pseries'
                  p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
                      item_id)
                  p_headers = {
                      'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
                          item_id),
                      'user-agent': get_random_user_agent('pc'),
                  }
                  try:
                      p_res = requests.request("GET", p_url, headers=p_headers,
                                               proxies=Common.tunnel_proxies(),
                                               timeout=request_timeout).json()
                      # Read inside the try so a response without 'data' is
                      # logged and skipped instead of raising out of the crawl.
                      series_videos = p_res['data'] or []
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
                      continue

                  for video in series_videos:
                      try:
                          item_id = video['item_id']
                          candidate = cls.get_video_info(log_type, crawler, item_id)
                          if not candidate:
                              continue
                          if not cls.is_ruled(log_type, crawler, candidate, rule_dict):
                              Common.logger(log_type, crawler).info(f'gid:{item_id},不符合抓取规则\n')
                              continue
                          cls._attach_play_urls(log_type, crawler, candidate, signature)
                      except Exception as e:
                          Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                          continue
                      # Only a fully resolved candidate becomes video_dict, so a
                      # rejected or half-filled one can never reach publishing.
                      video_dict = candidate
                      break

              if not video_dict:
                  # The collection contained no video that passed the rules.
                  continue

              if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
                  Common.logger(log_type, crawler).info(f'gid:{video_dict["video_id"]},视频已下载,无需重复下载\n')
                  continue
              for k, v in video_dict.items():
                  Common.logger(log_type, crawler).info(f"{k}:{v}")
              try:
                  cls.download_publish(
                      search_word=search_word,
                      log_type=log_type,
                      crawler=crawler,
                      video_dict=video_dict,
                      rule_dict=rule_dict,
                      strategy=strategy,
                      our_uid=our_uid,
                      oss_endpoint=oss_endpoint,
                      env=env,
                      machine=machine
                  )
              except Exception as e:
                  Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
                  continue
              # Counts every download_publish call that did not raise, including
              # calls where it discarded the video and returned early.
              total_count += 1
              # print(f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
              if total_count >= max_videos:
                  return
          offset += page_size

  @classmethod
  def _attach_play_urls(cls, log_type, crawler, video_dict, signature):
      """Add the stream URLs, frame size and session signature to ``video_dict``.

      Looks the URLs up with ``get_video_url`` using ``video_dict['gid']`` and
      copies ``video_width``, ``video_height``, ``audio_url`` and ``video_url``
      into ``video_dict`` (mutated in place), then stores ``signature`` under
      ``'session'``. A missing key raises ``KeyError`` for the caller to handle.

      :return: the same ``video_dict`` object.
      """
      video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
      for key in ('video_width', 'video_height', 'audio_url', 'video_url'):
          video_dict[key] = video_url_dict[key]
      video_dict['session'] = signature
      return video_dict
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count the ``crawler_video`` rows already stored for this video.

      Selects the rows whose ``platform`` equals ``cls.platform`` and whose
      ``out_video_id`` equals ``video_id``.

      :return: number of matching rows; callers treat non-zero as
          "already downloaded".
      """
      # NOTE(review): video_id is interpolated straight into the SQL text.
      # MysqlHelper.get_values is only called with a finished SQL string here,
      # so values containing quotes would break (or alter) the query.
      query = ' select * from crawler_video where platform="{}" and out_video_id="{}"; '.format(
          cls.platform, video_id)
      matching_rows = MysqlHelper.get_values(log_type, crawler, query, env, machine)
      return len(matching_rows)
  722. # 下载 / 上传
  @classmethod
  def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                       env, machine):
      """Download one video, publish it, and record it in Feishu and MySQL.

      Steps, in order:

      1. download the video and audio streams and compose them;
      2. probe the composed ``video.mp4`` with ``Common.ffmpeg``; if the probe
         returns ``None`` or a size of 0, delete the video directory and stop;
      3. download the cover and save the video info;
      4. upload/publish; if no id comes back, delete the video directory and
         stop;
      5. write one row to the Feishu sheet and insert one row into
         ``crawler_video``.

      :param log_type: passed to ``Common``, ``Feishu``, ``Publish`` and
          ``MysqlHelper`` calls.
      :param crawler: crawler name; also the root of the local video directory.
      :param search_word: keyword that produced this video (first sheet column).
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param video_dict: video record; must contain the detail keys plus
          ``video_url``, ``audio_url``, ``video_width`` and ``video_height``.
      :param rule_dict: crawl rules, stored as JSON in ``crawler_rule``.
      :param our_uid: forwarded to the upload and stored as ``user_id``.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param env: ``'dev'`` selects the test admin link; also forwarded.
      :param machine: forwarded to ``MysqlHelper.update_values``.
      :return: ``None`` in every case.
      """

      def sql_text(value):
          """Escape ``value`` for use inside a quoted MySQL string literal."""
          # Backslash first, so the backslashes added for quotes stay single.
          return str(value).replace('\\', '\\\\').replace('"', '\\"').replace("'", "\\'")

      title = video_dict['video_title']
      video_dir = f"./{crawler}/videos/{title}"

      # Download the video stream
      Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                             title=title, url=video_dict['video_url'])
      # Download the audio stream
      Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
                             title=title, url=video_dict['audio_url'])
      # Merge audio and video
      Common.video_compose(log_type=log_type, crawler=crawler, video_dir=video_dir)

      ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}/video.mp4")
      if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
          Common.logger(log_type, crawler).warning("下载的视频无效,已删除\n")
          # Remove the video directory
          shutil.rmtree(video_dir)
          return

      # Download the cover
      Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                             title=title, url=video_dict['cover_url'])
      # Save the video info to a txt file
      Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

      # Upload the video
      Common.logger(log_type, crawler).info("开始上传视频...")
      our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                crawler=crawler,
                                                strategy=strategy,
                                                our_uid=our_uid,
                                                env=env,
                                                oss_endpoint=oss_endpoint)
      Common.logger(log_type, crawler).info("视频上传完成")
      if our_video_id is None:
          # No id came back: remove the video directory and give up.
          shutil.rmtree(video_dir)
          return

      if env == 'dev':
          our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
      else:
          our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

      # Write the video to the Feishu sheet
      # presumably this makes room for a new row 2 — confirm against Feishu.insert_columns
      Feishu.insert_columns(log_type, 'xigua', "BUNvGC", "ROWS", 1, 2)
      upload_time = int(time.time())
      values = [[
          search_word,
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
          "关键词搜索",
          title,
          str(video_dict['video_id']),
          our_video_link,
          video_dict['gid'],
          video_dict['play_cnt'],
          video_dict['comment_cnt'],
          video_dict['like_cnt'],
          video_dict['share_cnt'],
          video_dict['duration'],
          str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
          video_dict['publish_time_str'],
          video_dict['user_name'],
          video_dict['user_id'],
          video_dict['avatar_url'],
          video_dict['cover_url'],
          video_dict['video_url'],
          video_dict['audio_url']]]
      time.sleep(1)
      Feishu.update_values(log_type, 'xigua', "BUNvGC", "E2:Z2", values)
      Common.logger(log_type, crawler).info("视频已保存至云文档\n")

      # Save the video info to the database.
      # The statement is assembled as text, so every free-text value goes
      # through sql_text(): a title containing a double quote would otherwise
      # terminate its literal early and break the INSERT.
      # NOTE(review): the strategy column is the fixed text below, not the
      # `strategy` argument — confirm this is intended for keyword search.
      insert_sql = f""" insert into crawler_video(video_id,
                      user_id,
                      out_user_id,
                      platform,
                      strategy,
                      out_video_id,
                      video_title,
                      cover_url,
                      video_url,
                      duration,
                      publish_time,
                      play_cnt,
                      crawler_rule,
                      width,
                      height)
                      values({our_video_id},
                      {our_uid},
                      "{sql_text(video_dict['user_id'])}",
                      "{cls.platform}",
                      "定向爬虫策略",
                      "{sql_text(video_dict['video_id'])}",
                      "{sql_text(title)}",
                      "{sql_text(video_dict['cover_url'])}",
                      "{sql_text(video_dict['video_url'])}",
                      {int(video_dict['duration'])},
                      "{sql_text(video_dict['publish_time_str'])}",
                      {int(video_dict['play_cnt'])},
                      '{sql_text(json.dumps(rule_dict))}',
                      {int(video_dict['video_width'])},
                      {int(video_dict['video_height'])}) """
      Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
      MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
      Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
  @classmethod
  def get_search_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Run the keyword crawl for every entry returned by ``get_user_list``.

      Loads the entries of sheet ``"SSPNPW"`` and calls ``get_videolist`` once
      per entry with its ``search_word`` and ``our_uid``.

      Errors are logged, never raised. A failure while processing one keyword
      is logged and the loop moves on to the next keyword; a failure loading
      or iterating the list itself ends the run.

      :param log_type: passed to ``Common.logger`` and the helper methods.
      :param crawler: passed to ``Common.logger`` and the helper methods.
      :param strategy: forwarded to ``get_videolist``.
      :param oss_endpoint: forwarded to ``get_videolist``.
      :param env: forwarded to ``get_user_list`` and ``get_videolist``.
      :param machine: forwarded to ``get_user_list`` and ``get_videolist``.
      :return: ``None``.
      """
      try:
          user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="SSPNPW", env=env,
                                        machine=machine)
          for user in user_list:
              # Isolate each keyword: one bad entry or one failed crawl must
              # not cancel the keywords that come after it.
              try:
                  search_word = user["search_word"]
                  our_uid = user["our_uid"]
                  Common.logger(log_type, crawler).info(f"开始抓取 {search_word} 用户主页视频\n")
                  cls.get_videolist(log_type=log_type,
                                    crawler=crawler,
                                    strategy=strategy,
                                    our_uid=our_uid,
                                    search_word=search_word,
                                    oss_endpoint=oss_endpoint,
                                    env=env,
                                    machine=machine)
              except Exception as e:
                  Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
      except Exception as e:
          # Loading the keyword list failed or it was not iterable.
          Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
  if __name__ == '__main__':
      # Manual debugging entry point. Running the file directly does nothing:
      # every call below is commented out and only `pass` executes.
      # The snippets are kept as scratch examples to uncomment when testing.
      # NOTE(review): most of them reference a `Follow` class, which is not
      # defined in the visible part of this file — confirm before enabling.
      # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
      # Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
      # Follow.get_videolist(log_type="follow",
      #                      crawler="xigua",
      #                      strategy="定向爬虫策略",
      #                      our_uid="6267141",
      #                      out_uid="95420624045",
      #                      oss_endpoint="out",
      #                      env="dev",
      #                      machine="local")
      # print(Follow.random_signature())
      # rule = Follow.get_rule("follow", "xigua")
      # print(type(rule))
      # print(type(json.dumps(rule)))
      # print(json.dumps(rule))
      pass