xigua_follow_scheduling.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. from datetime import date, timedelta
  13. import requests
  14. import urllib3
  15. from requests.adapters import HTTPAdapter
  16. sys.path.append(os.getcwd())
  17. from common.scheduling_db import MysqlHelper
  18. from common.common import Common
  19. from common.feishu import Feishu
  20. from common.publish import Publish
  21. class SchedulingFollow:
  22. # Pagination offset used when paging through an author's home-page video list
  23. offset = 0
  24. platform = "西瓜视频"
  @classmethod
  def get_users(cls, log_type, crawler, task, env):
      """Build the list of authors to crawl from the task's ``spider_link`` entries.

      For every link the external author id is parsed from the URL tail and
      the ``crawler_author_map`` table is queried for a row with the same
      ``spider_link``.

      :param log_type: passed through to ``Common.logger`` and ``MysqlHelper``.
      :param crawler: passed through to ``Common.logger`` and ``MysqlHelper``.
      :param task: mapping whose ``'spider_link'`` value is an iterable of
          ``https://www.ixigua.com/home/<uid>`` links.
      :param env: passed through to ``MysqlHelper.get_values``.
      :return: list of ``{"out_uid": int, "our_uid": ...}`` dicts; ``our_uid``
          is ``0`` when no row matches the link.
      :raises ValueError: if the tail of a link is not a plain integer.
      """
      user_list = []
      for link in task['spider_link']:
          # The external uid is whatever follows the home-page prefix, minus slashes.
          uid_text = link.split("https://www.ixigua.com/home/")[-1].replace("/", "").strip()
          out_uid = int(uid_text)

          # NOTE(review): the link is interpolated straight into the SQL text.
          # If spider_link can ever come from an untrusted source, switch to a
          # parameterized query once MysqlHelper's API is confirmed to support it.
          sql = f""" select * from crawler_author_map where spider_link="{link}" """
          our_user_info = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=sql, env=env)

          # Treat any falsy result (empty sequence, or None if the helper
          # returns that on failure - TODO confirm) as "no mapped account"
          # instead of crashing on len(None).
          if not our_user_info:
              our_uid = 0
              Common.logger(log_type, crawler).info(f"没有站内虚拟账号: {link}\n")
          else:
              our_uid = our_user_info[0]["media_id"]

          user_list.append({"out_uid": out_uid, "our_uid": our_uid})

      Common.logger(log_type, crawler).info(f"user_list:{user_list}")
      return user_list
  47. # Download rules
  48. @classmethod
  49. def download_rule_scheduling(cls, video_info_dict, task):
  50. try:
  51. play_cnt_min = int(task['play_cnt']['min'])
  52. except:
  53. play_cnt_min = 0
  54. try:
  55. video_like_min = int(task['video_like']['min'])
  56. except:
  57. video_like_min = 0
  58. try:
  59. share_cnt_min = int(task['share_cnt']['min'])
  60. except:
  61. share_cnt_min = 0
  62. try:
  63. video_width_min = int(task['video_width']['min'])
  64. except:
  65. video_width_min = 0
  66. try:
  67. video_height_min = task['video_height']['min']
  68. except:
  69. video_height_min = 0
  70. try:
  71. duration_min = int(task['duration_min'])
  72. except:
  73. duration_min = 0
  74. try:
  75. duration_max = int(task['duration_max'])
  76. except:
  77. duration_max = 1000000000
  78. if int(video_info_dict['play_cnt']) >= play_cnt_min:
  79. if int(video_info_dict['like_cnt']) >= video_like_min:
  80. if int(video_info_dict['share_cnt']) >= share_cnt_min:
  81. if duration_max >= int(video_info_dict['duration']) >= duration_min:
  82. if int(video_info_dict['video_width']) >= video_width_min:
  83. if int(video_info_dict['video_height']) >= video_height_min:
  84. return True
  85. else:
  86. return False
  87. else:
  88. return False
  89. else:
  90. return False
  91. else:
  92. return False
  93. else:
  94. return False
  95. else:
  96. return False
  97. # Filter-word list
  @classmethod
  def filter_words(cls, log_type, crawler):
      """Load the filter-word list from the Feishu sheet ``'KGB4Hc'``.

      Polls ``Feishu.get_values_batch`` until it returns something other than
      ``None``, then flattens the rows into a single list, dropping ``None``
      cells.

      :param log_type: passed through to ``Feishu`` and ``Common.logger``.
      :param crawler: passed through to ``Feishu`` and ``Common.logger``.
      :return: flat list of non-``None`` cell values, or ``None`` if an
          exception was raised (the error is logged, not propagated).
      """
      try:
          while True:
              filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if filter_words_sheet is None:
                  Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                  # The log message promises a 10-second back-off; without the
                  # sleep this loop would hammer the API in a tight spin.
                  time.sleep(10)
                  continue
              return [cell for row in filter_words_sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  116. @classmethod
  117. def random_signature(cls):
  118. src_digits = string.digits # string_数字
  119. src_uppercase = string.ascii_uppercase # string_大写字母
  120. src_lowercase = string.ascii_lowercase # string_小写字母
  121. digits_num = random.randint(1, 6)
  122. uppercase_num = random.randint(1, 26 - digits_num - 1)
  123. lowercase_num = 26 - (digits_num + uppercase_num)
  124. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  125. src_lowercase, lowercase_num)
  126. random.shuffle(password)
  127. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  128. new_password_start = new_password[0:18]
  129. new_password_end = new_password[-7:]
  130. if new_password[18] == '8':
  131. new_password = new_password_start + 'w' + new_password_end
  132. elif new_password[18] == '9':
  133. new_password = new_password_start + 'x' + new_password_end
  134. elif new_password[18] == '-':
  135. new_password = new_password_start + 'y' + new_password_end
  136. elif new_password[18] == '.':
  137. new_password = new_password_start + 'z' + new_password_end
  138. else:
  139. new_password = new_password_start + 'y' + new_password_end
  140. return new_password
  141. # Fetch video details
  @staticmethod
  def _decode_b64_url(encoded):
      """Decode a base64-encoded URL string, adding any missing ``=`` padding."""
      # Base64 text must be a multiple of 4 characters long. The previous
      # "% 3" arithmetic could leave a valid but unpadded value short.
      padded = encoded + '=' * (-len(encoded) % 4)
      return base64.b64decode(padded).decode('utf8')

  @classmethod
  def _select_stream(cls, resource):
      """Pick a stream from one ``videoResource`` entry; return ``None`` if none is usable."""
      # Fixed-quality entries are preferred, highest-numbered key first.
      if 'video_list' in resource:
          video_list = resource['video_list']
          for quality in ('video_4', 'video_3', 'video_2', 'video_1'):
              if quality in video_list:
                  stream = video_list[quality]
                  # Video and audio are both read from the same backup_url_1 field.
                  url = cls._decode_b64_url(stream['backup_url_1'])
                  return {
                      "video_url": url,
                      "audio_url": url,
                      "video_width": stream['vwidth'],
                      "video_height": stream['vheight'],
                  }

      # Otherwise fall back to the last entries of the dynamic video/audio lists.
      if 'dynamic_video' in resource:
          dynamic = resource['dynamic_video']
          if 'dynamic_video_list' in dynamic \
                  and 'dynamic_audio_list' in dynamic \
                  and len(dynamic['dynamic_video_list']) != 0 \
                  and len(dynamic['dynamic_audio_list']) != 0:
              video_stream = dynamic['dynamic_video_list'][-1]
              audio_stream = dynamic['dynamic_audio_list'][-1]
              return {
                  "video_url": cls._decode_b64_url(video_stream['backup_url_1']),
                  "audio_url": cls._decode_b64_url(audio_stream['backup_url_1']),
                  "video_width": video_stream['vwidth'],
                  "video_height": video_stream['vheight'],
              }
      return None

  @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Fetch the playable video/audio URLs and dimensions for one video.

      Calls the ``mixVideo/information`` endpoint for ``gid`` and inspects
      ``data.gidInformation.packerData.video.videoResource``. The first of
      ``dash_120fps``, ``dash``, ``normal`` that is present is used; inside it
      ``video_list`` entries ``video_4`` .. ``video_1`` are preferred, then
      the last entries of ``dynamic_video``. URLs are base64-decoded.

      :param log_type: passed through to ``Common.logger``.
      :param crawler: passed through to ``Common.logger``.
      :param gid: video id sent as the ``mixId`` query parameter.
      :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
          ``video_height`` (empty strings / zeros when no stream is found);
          ``None`` when the response has no ``data`` or any exception occurs
          (the exception is logged, not propagated).
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          # Certificate verification is disabled below, so silence the warning it triggers.
          urllib3.disable_warnings()
          # The context manager closes the session (and its adapters) when done.
          with requests.session() as s:
              # max_retries=3: retry up to three times
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
          # The body is fully read by now, so parse it once and reuse it.
          payload = response.json()
          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
              return None

          video_info = payload['data']['gidInformation']['packerData']['video']
          no_stream = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}
          if 'videoResource' not in video_info:
              return no_stream

          video_resource = video_info['videoResource']
          # Only the first resource kind present is consulted; if it has no
          # usable stream the result is empty rather than trying the next kind.
          for kind in ('dash_120fps', 'dash', 'normal'):
              if kind in video_resource:
                  selected = cls._select_stream(video_resource[kind])
                  return selected if selected is not None else no_stream
          return no_stream
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  528. @classmethod
  529. def get_videolist(cls, log_type, crawler, task, our_uid, out_uid, oss_endpoint, env):
  530. try:
  531. signature = cls.random_signature()
  532. while True:
  533. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  534. params = {
  535. 'to_user_id': str(out_uid),
  536. 'offset': str(cls.offset),
  537. 'limit': '30',
  538. 'maxBehotTime': '0',
  539. 'order': 'new',
  540. 'isHome': '0',
  541. '_signature': signature,
  542. }
  543. headers = {
  544. 'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  545. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  546. }
  547. urllib3.disable_warnings()
  548. s = requests.session()
  549. # max_retries=3 重试3次
  550. s.mount('http://', HTTPAdapter(max_retries=3))
  551. s.mount('https://', HTTPAdapter(max_retries=3))
  552. response = s.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
  553. response.close()
  554. cls.offset += 30
  555. if response.status_code != 200:
  556. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  557. cls.offset = 0
  558. return
  559. elif 'data' not in response.text:
  560. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  561. cls.offset = 0
  562. return
  563. elif 'videoList' not in response.json()["data"]:
  564. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  565. cls.offset = 0
  566. return
  567. else:
  568. videoList = response.json()['data']['videoList']
  569. for i in range(len(videoList)):
  570. # video_title
  571. if 'title' not in videoList[i]:
  572. video_title = 0
  573. else:
  574. video_title = videoList[i]['title'].strip().replace('手游', '') \
  575. .replace('/', '').replace('\/', '').replace('\n', '')
  576. # video_id
  577. if 'video_id' not in videoList[i]:
  578. video_id = 0
  579. else:
  580. video_id = videoList[i]['video_id']
  581. # gid
  582. if 'gid' not in videoList[i]:
  583. gid = 0
  584. else:
  585. gid = videoList[i]['gid']
  586. # play_cnt
  587. if 'video_detail_info' not in videoList[i]:
  588. play_cnt = 0
  589. elif 'video_watch_count' not in videoList[i]['video_detail_info']:
  590. play_cnt = 0
  591. else:
  592. play_cnt = videoList[i]['video_detail_info']['video_watch_count']
  593. # comment_cnt
  594. if 'comment_count' not in videoList[i]:
  595. comment_cnt = 0
  596. else:
  597. comment_cnt = videoList[i]['comment_count']
  598. # like_cnt
  599. if 'digg_count' not in videoList[i]:
  600. like_cnt = 0
  601. else:
  602. like_cnt = videoList[i]['digg_count']
  603. # share_cnt
  604. share_cnt = 0
  605. # video_duration
  606. if 'video_duration' not in videoList[i]:
  607. video_duration = 0
  608. else:
  609. video_duration = int(videoList[i]['video_duration'])
  610. # send_time
  611. if 'publish_time' not in videoList[i]:
  612. publish_time = 0
  613. else:
  614. publish_time = videoList[i]['publish_time']
  615. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
  616. # is_top
  617. if 'is_top' not in videoList[i]:
  618. is_top = 0
  619. else:
  620. is_top = videoList[i]['is_top']
  621. # user_name
  622. if 'user_info' not in videoList[i]:
  623. user_name = 0
  624. elif 'name' not in videoList[i]['user_info']:
  625. user_name = 0
  626. else:
  627. user_name = videoList[i]['user_info']['name']
  628. # user_id
  629. if 'user_info' not in videoList[i]:
  630. user_id = 0
  631. elif 'user_id' not in videoList[i]['user_info']:
  632. user_id = 0
  633. else:
  634. user_id = videoList[i]['user_info']['user_id']
  635. # avatar_url
  636. if 'user_info' not in videoList[i]:
  637. avatar_url = 0
  638. elif 'avatar_url' not in videoList[i]['user_info']:
  639. avatar_url = 0
  640. else:
  641. avatar_url = videoList[i]['user_info']['avatar_url']
  642. # cover_url
  643. if 'video_detail_info' not in videoList[i]:
  644. cover_url = 0
  645. elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
  646. cover_url = 0
  647. elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
  648. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
  649. else:
  650. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
  651. min_publish_time = int(task["min_publish_time"])
  652. min_publish_day = int(task["min_publish_day"])
  653. min_publish_day = (date.today() + timedelta(days=-min_publish_day)).strftime("%Y-%m-%d")
  654. min_publish_day = int(time.mktime(time.strptime(min_publish_day, "%Y-%m-%d")))
  655. if min_publish_time > 0 and min_publish_day > 0:
  656. publish_time_rule = min_publish_time
  657. elif min_publish_time > 0:
  658. publish_time_rule = min_publish_time
  659. else:
  660. publish_time_rule = min_publish_day
  661. if gid == 0 or video_id == 0 or cover_url == 0:
  662. Common.logger(log_type, crawler).info('无效视频\n')
  663. elif is_top is True and int(publish_time) < publish_time_rule:
  664. Common.logger(log_type, crawler).info(f'置顶视频,且发布时间超过抓取时间\n')
  665. elif int(publish_time) < publish_time_rule:
  666. Common.logger(log_type, crawler).info(f'发布时间超过抓取时间\n')
  667. cls.offset = 0
  668. return
  669. else:
  670. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  671. video_url = video_url_dict["video_url"]
  672. audio_url = video_url_dict["audio_url"]
  673. video_width = video_url_dict["video_width"]
  674. video_height = video_url_dict["video_height"]
  675. video_dict = {'video_title': video_title,
  676. 'video_id': video_id,
  677. 'gid': gid,
  678. 'play_cnt': play_cnt,
  679. 'comment_cnt': comment_cnt,
  680. 'like_cnt': like_cnt,
  681. 'share_cnt': share_cnt,
  682. 'video_width': video_width,
  683. 'video_height': video_height,
  684. 'duration': video_duration,
  685. 'publish_time_stamp': publish_time,
  686. 'publish_time_str': publish_time_str,
  687. 'is_top': is_top,
  688. 'user_name': user_name,
  689. 'user_id': user_id,
  690. 'avatar_url': avatar_url,
  691. 'cover_url': cover_url,
  692. 'audio_url': audio_url,
  693. 'video_url': video_url,
  694. 'session': signature}
  695. for k, v in video_dict.items():
  696. Common.logger(log_type, crawler).info(f"{k}:{v}")
  697. cls.download_publish(log_type=log_type,
  698. crawler=crawler,
  699. video_dict=video_dict,
  700. task=task,
  701. strategy=task["task_name"],
  702. our_uid=our_uid,
  703. oss_endpoint=oss_endpoint,
  704. env=env)
  705. except Exception as e:
  706. Common.logger(log_type, crawler).error(f"get_videolist:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Count the rows already stored in ``crawler_video`` for a video.

      :param log_type: forwarded unchanged to ``MysqlHelper.get_values``
      :param crawler: forwarded unchanged to ``MysqlHelper.get_values``
      :param video_id: source-platform video id, compared with ``out_video_id``
      :param env: forwarded unchanged to ``MysqlHelper.get_values``
      :return: ``len()`` of the lookup result; callers treat 0 as "not stored yet"
      """
      # The id originates from scraped data and is embedded in a double-quoted
      # SQL literal, so escape backslashes and double quotes before using it.
      # NOTE(review): assumes the server uses MySQL's default backslash
      # escaping (NO_BACKSLASH_ESCAPES not set) - confirm, or move to a
      # parameterized query if MysqlHelper offers one.
      safe_video_id = str(video_id).replace('\\', '\\\\').replace('"', '\\"')
      sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{safe_video_id}"; """
      rows = MysqlHelper.get_values(log_type, crawler, sql, env)
      # Deliberately no fallback for a non-sized result: a failed lookup should
      # surface as an error in the caller rather than look like "not stored".
      return len(rows)
  # Download / upload
  @classmethod
  def download_publish(cls, log_type, crawler, strategy, video_dict, task, our_uid, oss_endpoint, env):
      """Download one video, publish it, and record it in Feishu and MySQL.

      Steps, in order: check the task's download rules, the title filter words
      and the duplicate lookup; download the video, audio and cover; compose
      audio and video; upload through ``Publish.upload_and_publish``; write one
      row to the Feishu sheet; insert one row into ``crawler_video``.

      :param log_type: forwarded to ``Common.logger`` and the helper classes
      :param crawler: crawler name; also the root of the local ``videos`` directory
      :param strategy: forwarded to ``Publish.upload_and_publish``
      :param video_dict: video metadata; keys read here are ``video_title``,
          ``video_id``, ``gid``, ``play_cnt``, ``comment_cnt``, ``like_cnt``,
          ``share_cnt``, ``duration``, ``video_width``, ``video_height``,
          ``publish_time_str``, ``user_name``, ``user_id``, ``avatar_url``,
          ``cover_url``, ``video_url`` and ``audio_url``
      :param task: task settings; read for the download rules and for the
          ``crawler_rule`` column
      :param our_uid: forwarded to the publisher and stored in the ``user_id`` column
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``
      :param env: ``'dev'`` selects the test admin host for the video link;
          also forwarded to the helper classes
      :return: None; any exception is logged instead of being raised
      """

      def sql_text(value, quote='"'):
          # Escape backslashes and the enclosing quote so scraped text cannot
          # terminate the SQL string literal it is placed in.
          # NOTE(review): assumes MySQL's default backslash escaping
          # (NO_BACKSLASH_ESCAPES not set) - confirm, or use a parameterized
          # query if MysqlHelper offers one.
          return str(value).replace('\\', '\\\\').replace(quote, '\\' + quote)

      try:
          if cls.download_rule_scheduling(video_dict, task) is False:
              Common.logger(log_type, crawler).info('不满足抓取规则\n')
              return

          title = video_dict['video_title']
          # `word and ...` keeps an empty filter word from matching every title.
          if any(word and word in title for word in cls.filter_words(log_type, crawler)):
              Common.logger(log_type, crawler).info(f'标题已中过滤词:{title}\n')
              return

          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          video_dir = f"./{crawler}/videos/{title}"

          # Download the video stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=title, url=video_dict['video_url'])
          # Download the audio stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=title, url=video_dict['audio_url'])
          # Compose audio and video
          Common.video_compose(log_type=log_type, crawler=crawler, video_dir=video_dir)
          ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}/video.mp4")
          if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
              Common.logger(log_type, crawler).warning("下载的视频无效,已删除\n")
              # Remove the video folder
              shutil.rmtree(video_dir)
              return

          # Download the cover
          Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=title, url=video_dict['cover_url'])
          # Save the video info to a txt file
          Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

          # Upload the video
          Common.logger(log_type, crawler).info("开始上传视频...")
          our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                    crawler=crawler,
                                                    strategy=strategy,
                                                    our_uid=our_uid,
                                                    env=env,
                                                    oss_endpoint=oss_endpoint)
          if our_video_id is None:
              # Check for failure before logging completion or building the
              # link, so a failed upload is never reported as finished.
              Common.logger(log_type, crawler).warning("视频上传失败\n")
              # Remove the video folder
              shutil.rmtree(video_dir)
              return
          Common.logger(log_type, crawler).info("视频上传完成")

          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"

          # Write the video to the Feishu sheet
          Feishu.insert_columns(log_type, 'xigua', "e075e9", "ROWS", 1, 2)
          upload_time = int(time.time())
          values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                     "定向榜",
                     title,
                     str(video_dict['video_id']),
                     our_video_link,
                     video_dict['gid'],
                     video_dict['play_cnt'],
                     video_dict['comment_cnt'],
                     video_dict['like_cnt'],
                     video_dict['share_cnt'],
                     video_dict['duration'],
                     str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                     video_dict['publish_time_str'],
                     video_dict['user_name'],
                     video_dict['user_id'],
                     video_dict['avatar_url'],
                     video_dict['cover_url'],
                     video_dict['video_url'],
                     video_dict['audio_url']]]
          time.sleep(1)
          Feishu.update_values(log_type, 'xigua', "e075e9", "F2:Z2", values)
          Common.logger(log_type, crawler).info("视频已保存至云文档\n")

          rule_dict = {
              "play_cnt": task["play_cnt"],
              "video_width": task["video_width"],
              "video_height": task["video_height"],
              "video_like": task["video_like"],
              "share_cnt": task["share_cnt"],
              "duration": {"min": task["duration_min"], "max": task["duration_max"]}
          }

          # Save the video info to the database. Text values are escaped up
          # front because they come from scraped data; a raw quote in the title
          # would otherwise make the insert fail after the video was published.
          out_user_id = sql_text(video_dict['user_id'])
          out_video_id = sql_text(video_dict['video_id'])
          safe_title = sql_text(title)
          safe_cover_url = sql_text(video_dict['cover_url'])
          safe_video_url = sql_text(video_dict['video_url'])
          safe_publish_time = sql_text(video_dict['publish_time_str'])
          # The JSON rule sits in a single-quoted literal, so escape that quote.
          rule_json = sql_text(json.dumps(rule_dict), "'")
          insert_sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          {our_uid},
                          "{out_user_id}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{out_video_id}",
                          "{safe_title}",
                          "{safe_cover_url}",
                          "{safe_video_url}",
                          {int(video_dict['duration'])},
                          "{safe_publish_time}",
                          {int(video_dict['play_cnt'])},
                          '{rule_json}',
                          {int(video_dict['video_width'])},
                          {int(video_dict['video_height'])}) """
          Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
          MysqlHelper.update_values(log_type, crawler, insert_sql, env)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
  @classmethod
  def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
      """Crawl the home-page video list of every user returned by ``get_users``.

      Users whose ``our_uid`` converts to 0 are skipped. After each crawled
      user the shared ``cls.offset`` is reset to 0 and the loop pauses for one
      second. Any exception is logged and ends the whole run.

      :param log_type: forwarded to ``Common.logger``, ``get_users`` and ``get_videolist``
      :param crawler: forwarded to ``Common.logger``, ``get_users`` and ``get_videolist``
      :param task: forwarded to ``get_users`` and ``get_videolist``
      :param oss_endpoint: forwarded to ``get_videolist``
      :param env: forwarded to ``get_users`` and ``get_videolist``
      :return: None
      """
      try:
          followed_users = cls.get_users(log_type=log_type, crawler=crawler, task=task, env=env)
          for followed in followed_users:
              out_uid = followed["out_uid"]
              our_uid = int(followed["our_uid"])
              if our_uid == 0:
                  # Nothing is crawled for a user whose our_uid is 0.
                  continue

              Common.logger(log_type, crawler).info(f"开始抓取 {out_uid} 用户主页视频\n")
              cls.get_videolist(
                  log_type=log_type,
                  crawler=crawler,
                  task=task,
                  our_uid=our_uid,
                  out_uid=out_uid,
                  oss_endpoint=oss_endpoint,
                  env=env,
              )
              # Start the next user from the first page of results.
              cls.offset = 0
              time.sleep(1)
      except Exception as err:
          Common.logger(log_type, crawler).error(f"get_follow_videos:{err}\n")
  if __name__ == '__main__':
      # Manual check: print how many crawler_video rows the dev environment
      # already holds for one sample video id.
      #
      # Earlier manual check, kept for reference.
      # NOTE(review): its keyword arguments (spider_rule, machine) differ from
      # the get_users call made in get_follow_videos - looks stale, verify.
      # SchedulingFollow.get_users(log_type="follow",
      #                            crawler="xigua",
      #                            spider_rule="['https://www.ixigua.com/home/95420624045', 'https://www.ixigua.com/home/6431477489']",
      #                            env="dev",
      #                            machine="local")
      stored_rows = SchedulingFollow.repeat_video("follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev")
      print(stored_rows)