# xugua_search.py 50 KB
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import re
  9. import shutil
  10. import string
  11. import sys
  12. import time
  13. import requests
  14. import urllib3
  15. from urllib.parse import quote
  16. from requests.adapters import HTTPAdapter
  17. sys.path.append(os.getcwd())
  18. from common.db import MysqlHelper
  19. from common.users import Users
  20. from common.common import Common
  21. from common.feishu import Feishu
  22. from common.publish import Publish
  23. class Follow:
  24. platform = "西瓜视频"
  25. tag = "西瓜视频爬虫,搜索爬虫策略"
  @classmethod
  def get_rule(cls, log_type, crawler):
      """Read the crawl thresholds from Feishu sheet "shxOl7".

      The sheet is re-requested every 10 seconds for as long as Feishu
      returns None. Column index 2 of rows 1-4 is converted to int.

      :param log_type: forwarded to Feishu.get_values_batch and Common.logger
      :param crawler: forwarded to Feishu.get_values_batch and Common.logger
      :return: dict with the keys play_cnt, min_duration, max_duration and
          publish_time; None when any exception was raised (it is logged)
      """
      try:
          rule_sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          while rule_sheet is None:
              Common.logger(log_type, crawler).warning("rule_sheet is None! 10秒后重新获取")
              time.sleep(10)
              rule_sheet = Feishu.get_values_batch(log_type, crawler, "shxOl7")
          # (rule name, sheet row) pairs; the value always sits in column index 2.
          field_rows = (
              ("play_cnt", 1),
              ("min_duration", 2),
              ("max_duration", 3),
              ("publish_time", 4),
          )
          return {field: int(rule_sheet[row][2]) for field, row in field_rows}
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_rule:{e}\n")
  44. # 下载规则
  45. @classmethod
  46. def download_rule(cls, video_info_dict, rule_dict):
  47. if video_info_dict['play_cnt'] >= rule_dict['play_cnt']:
  48. if video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']:
  49. if video_info_dict['like_cnt'] >= rule_dict['like_cnt']:
  50. if video_info_dict['duration'] >= rule_dict['duration']:
  51. if video_info_dict['video_width'] >= rule_dict['video_width'] \
  52. or video_info_dict['video_height'] >= rule_dict['video_height']:
  53. return True
  54. else:
  55. return False
  56. else:
  57. return False
  58. else:
  59. return False
  60. else:
  61. return False
  62. else:
  63. return False
  # Filter word list
  @classmethod
  def filter_words(cls, log_type, crawler):
      """Load the filter words from Feishu sheet 'KGB4Hc'.

      The sheet is re-requested until Feishu returns something other than
      None, waiting 10 seconds between attempts.

      :param log_type: forwarded to Feishu.get_values_batch and Common.logger
      :param crawler: forwarded to Feishu.get_values_batch and Common.logger
      :return: flat list of every non-None cell of the sheet, row by row;
          None when any exception was raised (it is logged)
      """
      try:
          while True:
              filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if filter_words_sheet is None:
                  Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                  # The log line promises a 10 second back-off; without the
                  # sleep this loop would spin and hammer Feishu.
                  time.sleep(10)
                  continue
              return [cell for row in filter_words_sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  # Get user info (dict format). Note: some user_id values are int, others are str
  @classmethod
  def get_user_list(cls, log_type, crawler, sheetid, env, machine):
      """Build the list of search keywords and their in-site user ids from Feishu.

      Row 0 of the sheet is skipped. For each remaining row, column index 4
      is the search word and column index 6 the in-site uid. When the uid
      cell is empty a user is created through Users.create_uid and the new
      uid plus its admin link are written back to columns G:H of that row.

      :param log_type: forwarded to Feishu, Users and Common.logger
      :param crawler: forwarded to Feishu, Users and Common.logger
      :param sheetid: id of the Feishu sheet to read and update
      :param env: forwarded to Users.create_uid
      :param machine: not used by this method
      :return: list of dicts with the keys out_uid, search_word, our_uid and
          our_user_link; None when any exception was raised (it is logged)
      """
      try:
          while True:
              user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
              if user_sheet is None:
                  Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
                  # Honour the 10 second back-off announced in the log line
                  # instead of retrying in a tight loop.
                  time.sleep(10)
                  continue
              our_user_list = []
              for i in range(1, len(user_sheet)):
                  our_uid = user_sheet[i][6]
                  search_word = user_sheet[i][4]
                  Common.logger(log_type, crawler).info(f"正在更新 {search_word} 关键词信息\n")
                  if our_uid is None:
                      # Profile used to create the in-site user for this keyword.
                      user_dict = {
                          'nickName': search_word,
                          'avatarUrl': '',
                          'tagName': '',
                      }
                      our_uid = Users.create_uid(log_type, crawler, user_dict, env)
                      # Build the link only after the uid exists; formatting it
                      # earlier wrote ".../user/None/post" into the sheet.
                      our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
                      Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
                                           [[our_uid, our_user_link]])
                      Common.logger(log_type, crawler).info('站内用户信息写入飞书成功!\n')
                  our_user_list.append({
                      'out_uid': '',
                      'search_word': search_word,
                      'our_uid': our_uid,
                      'our_user_link': f'https://admin.piaoquantv.com/ums/user/{our_uid}/post',
                  })
              return our_user_list
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
  119. @classmethod
  120. def random_signature(cls):
  121. src_digits = string.digits # string_数字
  122. src_uppercase = string.ascii_uppercase # string_大写字母
  123. src_lowercase = string.ascii_lowercase # string_小写字母
  124. digits_num = random.randint(1, 6)
  125. uppercase_num = random.randint(1, 26 - digits_num - 1)
  126. lowercase_num = 26 - (digits_num + uppercase_num)
  127. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  128. src_lowercase, lowercase_num)
  129. random.shuffle(password)
  130. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  131. new_password_start = new_password[0:18]
  132. new_password_end = new_password[-7:]
  133. if new_password[18] == '8':
  134. new_password = new_password_start + 'w' + new_password_end
  135. elif new_password[18] == '9':
  136. new_password = new_password_start + 'x' + new_password_end
  137. elif new_password[18] == '-':
  138. new_password = new_password_start + 'y' + new_password_end
  139. elif new_password[18] == '.':
  140. new_password = new_password_start + 'z' + new_password_end
  141. else:
  142. new_password = new_password_start + 'y' + new_password_end
  143. return new_password
  @staticmethod
  def _decode_b64_url(encoded):
      """Decode a base64-encoded URL, adding whatever '=' padding is missing."""
      # Base64 text must be a multiple of 4 characters long (not 3).
      padded = encoded + '=' * (-len(encoded) % 4)
      return base64.b64decode(padded).decode('utf8')

  @classmethod
  def _stream_dict(cls, video_entry, audio_entry):
      """Build the result dict from one video entry and one audio entry."""
      return {
          "video_url": cls._decode_b64_url(video_entry['backup_url_1']),
          "audio_url": cls._decode_b64_url(audio_entry['backup_url_1']),
          "video_width": video_entry['vwidth'],
          "video_height": video_entry['vheight'],
      }

  @classmethod
  def _extract_stream(cls, video_info):
      """Pick the stream to download from the 'video' part of the API payload."""
      empty = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}
      if 'videoResource' not in video_info:
          return empty
      resources = video_info['videoResource']
      # Only the first resource kind that is present is inspected; when it
      # holds no usable stream there is no fall-through to the next kind.
      kind = next((k for k in ('dash_120fps', 'dash', 'normal') if k in resources), None)
      if kind is None:
          return empty
      resource = resources[kind]
      if 'video_list' in resource:
          # video_4 is tried first, video_1 last.
          for level in ('video_4', 'video_3', 'video_2', 'video_1'):
              if level in resource['video_list']:
                  entry = resource['video_list'][level]
                  # video_list entries serve as both the video and the audio source.
                  return cls._stream_dict(entry, entry)
      if 'dynamic_video' in resource:
          dynamic = resource['dynamic_video']
          if 'dynamic_video_list' in dynamic and 'dynamic_audio_list' in dynamic \
                  and len(dynamic['dynamic_video_list']) != 0 \
                  and len(dynamic['dynamic_audio_list']) != 0:
              # The last entry of each list is used.
              return cls._stream_dict(dynamic['dynamic_video_list'][-1],
                                      dynamic['dynamic_audio_list'][-1])
      return empty

  # Get video details
  @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Fetch the download URLs and frame size of one video.

      Calls the mixVideo/information endpoint through the tunnel proxy
      (5 second timeout, up to 3 retries on the adapter) and decodes the
      base64-encoded stream URLs found in the response.

      :param log_type: forwarded to Common.logger
      :param crawler: forwarded to Common.logger
      :param gid: video id sent as the `mixId` query parameter
      :return: dict with the keys video_url, audio_url, video_width and
          video_height (empty strings and zeros when no stream was found);
          None when the response carries no data or any exception was
          raised (both cases are logged)
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          # The request is sent with verify=False, so silence the TLS warnings.
          urllib3.disable_warnings()
          # The context manager closes the session (and its pooled connections) when done.
          with requests.session() as s:
              # max_retries=3: retry up to 3 times
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
              response.close()
          # Parse the body once instead of once per access.
          payload = response.json()
          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
              return None
          video_info = payload['data']['gidInformation']['packerData']['video']
          return cls._extract_stream(video_info)
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  @classmethod
  def get_video_info(cls, item_id):
      """Fetch the metadata of one video from the article/full endpoint.

      Nothing is caught here: request errors (including timeouts), JSON
      decoding errors and missing keys all propagate to the caller.

      :param item_id: id substituted into the request path (it appears twice)
      :return: dict with the keys video_title, video_id, gid, play_cnt,
          comment_cnt, like_cnt, share_cnt, duration, publish_time_stamp,
          publish_time_str, user_name, user_id, avatar_url and cover_url
      """
      d_headers = {
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
      }
      d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
              "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
              "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
              "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
              "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
                  video_id=item_id)
      # Without a timeout a stalled proxy or server would block the crawler
      # forever; 5 seconds is the bound get_video_url already uses.
      res = requests.get(url=d_url, headers=d_headers, proxies=Common.tunnel_proxies(), timeout=5)
      data = json.loads(res.text)['data']
      item_counter = data['h5_extra']['itemCell']['itemCounter']
      user_info = data['user_info']
      detail_info = data['video_detail_info']
      video_dict = {
          'video_title': data['title'],
          'video_id': detail_info['video_id'],
          'gid': data['group_id'],
          'play_cnt': item_counter['videoWatchCount'],
          'comment_cnt': item_counter['commentCount'],
          'like_cnt': item_counter['diggCount'],
          'share_cnt': item_counter['shareCount'],
          'duration': data['video_duration'],
          'publish_time_stamp': data['publish_time'],
          # Formatted in the machine's local time zone.
          'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime(data['publish_time'])),
          'user_name': user_info['name'],
          'user_id': user_info['user_id'],
          'avatar_url': user_info['avatar_url'],
          # NOTE(review): '\u0026' in a normal string literal is already '&',
          # so this replace looks like a no-op - confirm the intent.
          'cover_url': data['large_image']['url'].replace('\u0026', '&'),
      }
      return video_dict
  587. @classmethod
  588. def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
  589. old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
  590. if video_dict['publish_time_stamp'] <= old_time:
  591. return False
  592. elif video_dict['play_cnt'] <= rule_dict['play_cnt']:
  593. return False
  594. elif video_dict['duration'] < rule_dict['min_duration'] or video_dict['duration'] > rule_dict['max_duration']:
  595. return False
  596. else:
  597. return True
  @classmethod
  def _attach_play_info(cls, log_type, crawler, video_dict, signature):
      """Merge resolution and stream URLs from ``get_video_url`` into ``video_dict``.

      Any exception raised by ``get_video_url`` (or a missing key in its
      result) propagates to the caller. Returns the same ``video_dict``.
      """
      video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
      for key in ('video_width', 'video_height', 'audio_url', 'video_url'):
          video_dict[key] = video_url_dict[key]
      video_dict['session'] = signature
      return video_dict

  @classmethod
  def get_videolist(cls, log_type, crawler, strategy, our_uid, search_word, oss_endpoint, env, machine):
      """Page through ixigua search results for ``search_word`` and publish matches.

      Result pages are requested with an offset that grows by 10 per page.
      Each entry of type ``video`` is resolved directly; for an entry of type
      ``pseries`` the series is fetched and the first episode that passes
      ``is_ruled`` and whose play URLs can be resolved is used. Entries of
      any other type are skipped. Every video that passes ``is_ruled`` is
      handed to ``download_publish``.

      The method returns (``None``) when a page is empty, when the search
      request fails, or once 30 videos have been handed to
      ``download_publish``.

      :param log_type: logger channel passed to ``Common.logger``.
      :param crawler: crawler name passed to ``Common.logger`` and helpers.
      :param strategy: forwarded unchanged to ``download_publish``.
      :param our_uid: forwarded unchanged to ``download_publish``.
      :param search_word: keyword to search for (URL-quoted before use).
      :param oss_endpoint: forwarded unchanged to ``download_publish``.
      :param env: forwarded unchanged to ``download_publish``.
      :param machine: forwarded unchanged to ``download_publish``.
      """
      total_count = 0
      offset = 0
      while True:
          signature = cls.random_signature()
          url = "https://www.ixigua.com/api/searchv2/complex/{}/{}?_signature={}".format(
              quote(search_word), offset, signature)
          headers = {
              'accept': 'application/json, text/plain, */*',
              'referer': 'https://www.ixigua.com/search/{}/?logTag=594535e3690f17a88cdb&tab_name=search'.format(
                  quote(search_word)),
              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
          }
          try:
              res = requests.request("GET", url, headers=headers, proxies=Common.tunnel_proxies())
              search_list = res.json()['data']['data']
          except Exception as e:
              # Still treated as "no more results", but no longer silently.
              Common.logger(log_type, crawler).error(f'搜索词:{search_word},没有获取到搜索结果,原因:{e}')
              search_list = []
          if not search_list:
              return

          for video_info in search_list:
              v_type = video_info.get('type')
              # Only these two result types are understood; anything else used
              # to reach is_ruled() with an empty dict and raise KeyError.
              if v_type not in ('video', 'pseries'):
                  continue
              item_id = video_info['data']['group_id']
              rule_dict = cls.get_rule(log_type, crawler)

              if v_type == 'video':
                  try:
                      video_dict = cls.get_video_info(item_id)
                      cls._attach_play_info(log_type, crawler, video_dict, signature)
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
                      continue
                  if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
                      continue
              else:
                  p_url = "https://www.ixigua.com/api/videov2/pseries_more_v2?pSeriesId={}&rank=0&tailCount=30&aid=1768&msToken=wHEafKFLx0k3hihOPbhXYNsfMBxWiq2AB0K5R-34kEFixyq3ATi_DuXbL4Q47J9C2uK2zgWItMa1g2yc4FyDxM4dMijmSdwF4c4T8sSmOkoOI0wGzeEcPw==&X-Bogus=DFSzswVOzdUANG3ItaVHYr7TlqCv&_signature=_02B4Z6wo00001vB6l3QAAIDBZKzMeTihTmbwepPAANgh1Ai3JgFFo4e6anoezmBEpHfEMEYlWISGhXI-QKfev4N-2bwgXsHOuNGLnOsGqMbANIjFPh7Yj6OakQWrkbACenlv0P-arswtB6Zn45".format(
                      item_id)
                  p_headers = {
                      'referer': 'https://www.ixigua.com/{}?series_flow=1&logTag=cfec9d927da968feff89'.format(
                          item_id),
                      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
                  }
                  try:
                      p_res = requests.request("GET", p_url, headers=p_headers, proxies=Common.tunnel_proxies()).json()
                      # Read the payload inside the try so a malformed response
                      # skips this series instead of aborting the crawl.
                      series_videos = p_res['data'] or []
                  except Exception as e:
                      Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
                      continue

                  # Only a fully resolved episode (details + rule + URLs) is
                  # accepted; a half-filled dict is never passed on.
                  video_dict = None
                  for video in series_videos:
                      episode_id = None
                      try:
                          episode_id = video['item_id']
                          candidate = cls.get_video_info(episode_id)
                          if not cls.is_ruled(log_type, crawler, candidate, rule_dict):
                              continue
                          video_dict = cls._attach_play_info(log_type, crawler, candidate, signature)
                          break
                      except Exception as e:
                          Common.logger(log_type, crawler).error(f'视频:{episode_id},没有获取到视频详情,原因:{e}')
                  if video_dict is None:
                      continue

              for k, v in video_dict.items():
                  Common.logger(log_type, crawler).info(f"{k}:{v}")
              cls.download_publish(
                  search_word=search_word,
                  log_type=log_type,
                  crawler=crawler,
                  video_dict=video_dict,
                  rule_dict=rule_dict,
                  strategy=strategy,
                  our_uid=our_uid,
                  oss_endpoint=oss_endpoint,
                  env=env,
                  machine=machine
              )
              total_count += 1
              if total_count >= 30:
                  return
          offset += 10
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env, machine):
      """Count the ``crawler_video`` rows stored for this platform and video id.

      :param log_type: forwarded to ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``MysqlHelper.get_values``.
      :param video_id: value matched against the ``out_video_id`` column.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :param machine: forwarded to ``MysqlHelper.get_values``.
      :return: number of matching rows (``0`` means the video is new).
      """
      # The id is embedded in a double-quoted SQL literal, so escape the
      # characters that could terminate or alter that literal.
      # NOTE(review): assumes MySQL's default backslash-escape mode; switch to
      # a parameterized query if MysqlHelper offers one - TODO confirm.
      safe_video_id = str(video_id).replace('\\', '\\\\').replace('"', '\\"')
      sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{safe_video_id}"; """
      matching_rows = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
      return len(matching_rows)
  # Download / upload
  @classmethod
  def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                       env, machine):
      """Download one video, publish it, and record it in Feishu and MySQL.

      Steps, in order: download the video and audio streams, compose them,
      validate the result with ``Common.ffmpeg`` (a missing result or size 0
      deletes the folder and stops), download the cover, save the video
      info, upload via ``Publish.upload_and_publish`` (a ``None`` id deletes
      the folder and stops), write one row to the Feishu sheet, then insert
      one row into ``crawler_video``.

      Every exception is caught and logged; the method always returns
      ``None``.

      :param log_type: logger channel passed to ``Common.logger`` and helpers.
      :param crawler: crawler name; also the root of the local video folder.
      :param search_word: keyword written to the first Feishu column.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param video_dict: video metadata, including ``video_title``,
          ``video_url``, ``audio_url``, ``cover_url`` and the counters used
          below.
      :param rule_dict: stored as JSON in the ``crawler_rule`` column.
      :param our_uid: forwarded to the upload and stored as ``user_id``.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param env: ``'dev'`` selects the test admin link; anything else the
          production link. Also forwarded to the upload and MySQL helpers.
      :param machine: forwarded to ``MysqlHelper.update_values``.
      """

      def sql_text(value):
          """Escape ``value`` for use inside a quoted MySQL string literal."""
          # NOTE(review): assumes MySQL's default backslash-escape mode; a
          # parameterized query would be preferable if MysqlHelper supports it.
          return str(value).replace('\\', '\\\\').replace('"', '\\"').replace("'", "\\'")

      try:
          video_dir = f"./{crawler}/videos/{video_dict['video_title']}"

          # Download the video stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                 title=video_dict['video_title'], url=video_dict['video_url'])
          # Download the audio stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
                                 title=video_dict['video_title'], url=video_dict['audio_url'])
          # Merge audio and video
          Common.video_compose(log_type=log_type, crawler=crawler, video_dir=video_dir)
          ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}/video.mp4")
          if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
              Common.logger(log_type, crawler).warning(f"下载的视频无效,已删除\n")
              # Remove the video folder
              shutil.rmtree(video_dir)
              return

          # Download the cover image
          Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                                 title=video_dict['video_title'], url=video_dict['cover_url'])
          # Save the video info to a txt file
          Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

          # Upload the video
          Common.logger(log_type, crawler).info("开始上传视频...")
          our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                    crawler=crawler,
                                                    strategy=strategy,
                                                    our_uid=our_uid,
                                                    env=env,
                                                    oss_endpoint=oss_endpoint)
          # Check for failure before reporting success or building a link
          # that would contain "None".
          if our_video_id is None:
              Common.logger(log_type, crawler).warning("视频上传失败,已删除\n")
              # Remove the video folder
              shutil.rmtree(video_dir)
              return
          Common.logger(log_type, crawler).info("视频上传完成")
          if env == 'dev':
              our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
          else:
              our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

          # Write the video to Feishu: insert a blank row, then fill it
          Feishu.insert_columns(log_type, 'xigua', "BUNvGC", "ROWS", 1, 2)
          upload_time = int(time.time())
          values = [[
              search_word,
              time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
              "关键词搜索",
              video_dict['video_title'],
              str(video_dict['video_id']),
              our_video_link,
              video_dict['gid'],
              video_dict['play_cnt'],
              video_dict['comment_cnt'],
              video_dict['like_cnt'],
              video_dict['share_cnt'],
              video_dict['duration'],
              str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
              video_dict['publish_time_str'],
              video_dict['user_name'],
              video_dict['user_id'],
              video_dict['avatar_url'],
              video_dict['cover_url'],
              video_dict['video_url'],
              video_dict['audio_url']]]
          time.sleep(1)
          Feishu.update_values(log_type, 'xigua', "BUNvGC", "E2:Z2", values)
          Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")

          # Save the video info to the database. Free-text values (title,
          # URLs, ids, JSON) are escaped so a quote or backslash in them
          # cannot break the statement.
          # NOTE(review): the strategy column is a fixed literal rather than
          # the ``strategy`` argument - confirm this is intended.
          insert_sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          {our_uid},
                          "{sql_text(video_dict['user_id'])}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{sql_text(video_dict['video_id'])}",
                          "{sql_text(video_dict['video_title'])}",
                          "{sql_text(video_dict['cover_url'])}",
                          "{sql_text(video_dict['video_url'])}",
                          {int(video_dict['duration'])},
                          "{video_dict['publish_time_str']}",
                          {int(video_dict['play_cnt'])},
                          '{sql_text(json.dumps(rule_dict))}',
                          {int(video_dict['video_width'])},
                          {int(video_dict['video_height'])}) """
          Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
          MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
  @classmethod
  def get_search_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
      """Run ``get_videolist`` for every keyword returned by ``get_user_list``.

      The keyword list is read from sheet ``"SSPNPW"``; each entry must
      provide ``search_word`` and ``our_uid``. A failure while processing one
      keyword is logged and the remaining keywords are still processed. A
      failure while loading the list is logged and ends the run. The method
      never raises and returns ``None``.

      :param log_type: logger channel passed to ``Common.logger`` and helpers.
      :param crawler: crawler name passed to helpers.
      :param strategy: forwarded to ``get_videolist``.
      :param oss_endpoint: forwarded to ``get_videolist``.
      :param env: forwarded to ``get_user_list`` and ``get_videolist``.
      :param machine: forwarded to ``get_user_list`` and ``get_videolist``.
      """
      try:
          user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="SSPNPW", env=env,
                                        machine=machine)
          for user in user_list:
              # Isolate each keyword so one bad entry or crawl error does not
              # abort the rest of the list.
              try:
                  search_word = user["search_word"]
                  our_uid = user["our_uid"]
                  Common.logger(log_type, crawler).info(f"开始抓取 {search_word} 用户主页视频\n")
                  cls.get_videolist(log_type=log_type,
                                    crawler=crawler,
                                    strategy=strategy,
                                    our_uid=our_uid,
                                    search_word=search_word,
                                    oss_endpoint=oss_endpoint,
                                    env=env,
                                    machine=machine)
              except Exception as e:
                  Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_search_videos:{e}\n")
  if __name__ == '__main__':
      # Manual entry point: run one keyword-search crawl with env='dev'
      # on the local machine.
      Follow.get_search_videos(log_type='search',
                               crawler='xigua',
                               strategy='xigua_search',
                               oss_endpoint='out',
                               env='dev',
                               machine='local')