xigua_follow_scheduling.py 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. from datetime import date, timedelta
  13. from hashlib import md5
  14. import requests
  15. import urllib3
  16. from requests.adapters import HTTPAdapter
  17. sys.path.append(os.getcwd())
  18. from common.scheduling_db import MysqlHelper
  19. from common.common import Common
  20. from common.feishu import Feishu
  21. from common.publish import Publish
  22. class SchedulingFollow:
  23. # 个人主页视频翻页参数
  24. offset = 0
  25. platform = "西瓜视频"
  26. @classmethod
  27. def get_users(cls, log_type, crawler, task, env):
  28. link_list = task['spider_link']
  29. user_list = []
  30. for link in link_list:
  31. out_uid = int(link.split("https://www.ixigua.com/home/")[-1].replace("/", "").strip())
  32. sql = f""" select * from crawler_author_map where spider_link="{link}" """
  33. our_user_info = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=sql, env=env)
  34. if len(our_user_info) == 0:
  35. our_uid = 0
  36. Common.logger(log_type, crawler).info(f"没有站内虚拟账号: {link}\n")
  37. else:
  38. # print(type(our_user_info[0]))
  39. # print(our_user_info[0])
  40. our_uid = our_user_info[0]["media_id"]
  41. user_dict = {
  42. "out_uid": out_uid,
  43. "our_uid": our_uid
  44. }
  45. user_list.append(user_dict)
  46. Common.logger(log_type, crawler).info(f"user_list:{user_list}")
  47. return user_list
  48. # 下载规则
  49. @classmethod
  50. def download_rule_scheduling(cls, video_info_dict, task):
  51. try:
  52. play_cnt_min = int(task['play_cnt']['min'])
  53. except:
  54. play_cnt_min = 0
  55. try:
  56. video_like_min = int(task['video_like']['min'])
  57. except:
  58. video_like_min = 0
  59. try:
  60. share_cnt_min = int(task['share_cnt']['min'])
  61. except:
  62. share_cnt_min = 0
  63. try:
  64. video_width_min = int(task['video_width']['min'])
  65. except:
  66. video_width_min = 0
  67. try:
  68. video_height_min = task['video_height']['min']
  69. except:
  70. video_height_min = 0
  71. try:
  72. duration_min = int(task['duration_min'])
  73. except:
  74. duration_min = 0
  75. try:
  76. duration_max = int(task['duration_max'])
  77. except:
  78. duration_max = 1000000000
  79. if int(video_info_dict['play_cnt']) >= play_cnt_min:
  80. if int(video_info_dict['like_cnt']) >= video_like_min:
  81. if int(video_info_dict['share_cnt']) >= share_cnt_min:
  82. if duration_max >= int(video_info_dict['duration']) >= duration_min:
  83. if int(video_info_dict['video_width']) >= video_width_min:
  84. if int(video_info_dict['video_height']) >= video_height_min:
  85. return True
  86. else:
  87. return False
  88. else:
  89. return False
  90. else:
  91. return False
  92. else:
  93. return False
  94. else:
  95. return False
  96. else:
  97. return False
  98. # 过滤词库
  99. @classmethod
  100. def filter_words(cls, log_type, crawler):
  101. try:
  102. while True:
  103. filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
  104. if filter_words_sheet is None:
  105. Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
  106. continue
  107. filter_words_list = []
  108. for x in filter_words_sheet:
  109. for y in x:
  110. if y is None:
  111. pass
  112. else:
  113. filter_words_list.append(y)
  114. return filter_words_list
  115. except Exception as e:
  116. Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  117. @classmethod
  118. def random_signature(cls):
  119. src_digits = string.digits # string_数字
  120. src_uppercase = string.ascii_uppercase # string_大写字母
  121. src_lowercase = string.ascii_lowercase # string_小写字母
  122. digits_num = random.randint(1, 6)
  123. uppercase_num = random.randint(1, 26 - digits_num - 1)
  124. lowercase_num = 26 - (digits_num + uppercase_num)
  125. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  126. src_lowercase, lowercase_num)
  127. random.shuffle(password)
  128. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  129. new_password_start = new_password[0:18]
  130. new_password_end = new_password[-7:]
  131. if new_password[18] == '8':
  132. new_password = new_password_start + 'w' + new_password_end
  133. elif new_password[18] == '9':
  134. new_password = new_password_start + 'x' + new_password_end
  135. elif new_password[18] == '-':
  136. new_password = new_password_start + 'y' + new_password_end
  137. elif new_password[18] == '.':
  138. new_password = new_password_start + 'z' + new_password_end
  139. else:
  140. new_password = new_password_start + 'y' + new_password_end
  141. return new_password
  142. # 获取视频详情
  143. @classmethod
  144. def get_video_url(cls, log_type, crawler, gid):
  145. try:
  146. url = 'https://www.ixigua.com/api/mixVideo/information?'
  147. headers = {
  148. "accept-encoding": "gzip, deflate",
  149. "accept-language": "zh-CN,zh-Hans;q=0.9",
  150. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
  151. "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
  152. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  153. }
  154. params = {
  155. 'mixId': gid,
  156. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
  157. 'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  158. 'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
  159. '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
  160. 'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
  161. }
  162. cookies = {
  163. 'ixigua-a-s': '1',
  164. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
  165. 'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  166. 'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
  167. '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
  168. 'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
  169. 'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
  170. '__ac_nonce': '06304878000964fdad287',
  171. '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
  172. 'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
  173. 'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
  174. '_tea_utm_cache_1300': 'undefined',
  175. 'support_avif': 'false',
  176. 'support_webp': 'false',
  177. 'xiguavideopcwebid': '7134967546256016900',
  178. 'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
  179. }
  180. urllib3.disable_warnings()
  181. s = requests.session()
  182. # max_retries=3 重试3次
  183. s.mount('http://', HTTPAdapter(max_retries=3))
  184. s.mount('https://', HTTPAdapter(max_retries=3))
  185. response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
  186. response.close()
  187. if 'data' not in response.json() or response.json()['data'] == '':
  188. Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
  189. else:
  190. video_info = response.json()['data']['gidInformation']['packerData']['video']
  191. video_url_dict = {}
  192. # video_url
  193. if 'videoResource' not in video_info:
  194. video_url_dict["video_url"] = ''
  195. video_url_dict["audio_url"] = ''
  196. video_url_dict["video_width"] = 0
  197. video_url_dict["video_height"] = 0
  198. elif 'dash_120fps' in video_info['videoResource']:
  199. if "video_list" in video_info['videoResource']['dash_120fps'] and 'video_4' in video_info['videoResource']['dash_120fps']['video_list']:
  200. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
  201. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
  202. if len(video_url) % 3 == 1:
  203. video_url += '=='
  204. elif len(video_url) % 3 == 2:
  205. video_url += '='
  206. elif len(audio_url) % 3 == 1:
  207. audio_url += '=='
  208. elif len(audio_url) % 3 == 2:
  209. audio_url += '='
  210. video_url = base64.b64decode(video_url).decode('utf8')
  211. audio_url = base64.b64decode(audio_url).decode('utf8')
  212. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_4']['vwidth']
  213. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_4']['vheight']
  214. video_url_dict["video_url"] = video_url
  215. video_url_dict["audio_url"] = audio_url
  216. video_url_dict["video_width"] = video_width
  217. video_url_dict["video_height"] = video_height
  218. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_3' in video_info['videoResource']['dash_120fps']['video_list']:
  219. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
  220. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
  221. if len(video_url) % 3 == 1:
  222. video_url += '=='
  223. elif len(video_url) % 3 == 2:
  224. video_url += '='
  225. elif len(audio_url) % 3 == 1:
  226. audio_url += '=='
  227. elif len(audio_url) % 3 == 2:
  228. audio_url += '='
  229. video_url = base64.b64decode(video_url).decode('utf8')
  230. audio_url = base64.b64decode(audio_url).decode('utf8')
  231. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_3']['vwidth']
  232. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_3']['vheight']
  233. video_url_dict["video_url"] = video_url
  234. video_url_dict["audio_url"] = audio_url
  235. video_url_dict["video_width"] = video_width
  236. video_url_dict["video_height"] = video_height
  237. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_2' in video_info['videoResource']['dash_120fps']['video_list']:
  238. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
  239. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
  240. if len(video_url) % 3 == 1:
  241. video_url += '=='
  242. elif len(video_url) % 3 == 2:
  243. video_url += '='
  244. elif len(audio_url) % 3 == 1:
  245. audio_url += '=='
  246. elif len(audio_url) % 3 == 2:
  247. audio_url += '='
  248. video_url = base64.b64decode(video_url).decode('utf8')
  249. audio_url = base64.b64decode(audio_url).decode('utf8')
  250. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_2']['vwidth']
  251. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_2']['vheight']
  252. video_url_dict["video_url"] = video_url
  253. video_url_dict["audio_url"] = audio_url
  254. video_url_dict["video_width"] = video_width
  255. video_url_dict["video_height"] = video_height
  256. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_1' in video_info['videoResource']['dash_120fps']['video_list']:
  257. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
  258. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
  259. if len(video_url) % 3 == 1:
  260. video_url += '=='
  261. elif len(video_url) % 3 == 2:
  262. video_url += '='
  263. elif len(audio_url) % 3 == 1:
  264. audio_url += '=='
  265. elif len(audio_url) % 3 == 2:
  266. audio_url += '='
  267. video_url = base64.b64decode(video_url).decode('utf8')
  268. audio_url = base64.b64decode(audio_url).decode('utf8')
  269. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_1']['vwidth']
  270. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_1']['vheight']
  271. video_url_dict["video_url"] = video_url
  272. video_url_dict["audio_url"] = audio_url
  273. video_url_dict["video_width"] = video_width
  274. video_url_dict["video_height"] = video_height
  275. elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
  276. and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  277. and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  278. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
  279. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
  280. video_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  281. audio_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  282. if len(video_url) % 3 == 1:
  283. video_url += '=='
  284. elif len(video_url) % 3 == 2:
  285. video_url += '='
  286. elif len(audio_url) % 3 == 1:
  287. audio_url += '=='
  288. elif len(audio_url) % 3 == 2:
  289. audio_url += '='
  290. video_url = base64.b64decode(video_url).decode('utf8')
  291. audio_url = base64.b64decode(audio_url).decode('utf8')
  292. video_width = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  293. video_height = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  294. video_url_dict["video_url"] = video_url
  295. video_url_dict["audio_url"] = audio_url
  296. video_url_dict["video_width"] = video_width
  297. video_url_dict["video_height"] = video_height
  298. else:
  299. video_url_dict["video_url"] = ''
  300. video_url_dict["audio_url"] = ''
  301. video_url_dict["video_width"] = 0
  302. video_url_dict["video_height"] = 0
  303. elif 'dash' in video_info['videoResource']:
  304. if "video_list" in video_info['videoResource']['dash'] and 'video_4' in video_info['videoResource']['dash']['video_list']:
  305. video_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  306. audio_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  307. if len(video_url) % 3 == 1:
  308. video_url += '=='
  309. elif len(video_url) % 3 == 2:
  310. video_url += '='
  311. elif len(audio_url) % 3 == 1:
  312. audio_url += '=='
  313. elif len(audio_url) % 3 == 2:
  314. audio_url += '='
  315. video_url = base64.b64decode(video_url).decode('utf8')
  316. audio_url = base64.b64decode(audio_url).decode('utf8')
  317. video_width = video_info['videoResource']['dash']['video_list']['video_4']['vwidth']
  318. video_height = video_info['videoResource']['dash']['video_list']['video_4']['vheight']
  319. video_url_dict["video_url"] = video_url
  320. video_url_dict["audio_url"] = audio_url
  321. video_url_dict["video_width"] = video_width
  322. video_url_dict["video_height"] = video_height
  323. elif "video_list" in video_info['videoResource']['dash'] and 'video_3' in video_info['videoResource']['dash']['video_list']:
  324. video_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  325. audio_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  326. if len(video_url) % 3 == 1:
  327. video_url += '=='
  328. elif len(video_url) % 3 == 2:
  329. video_url += '='
  330. elif len(audio_url) % 3 == 1:
  331. audio_url += '=='
  332. elif len(audio_url) % 3 == 2:
  333. audio_url += '='
  334. video_url = base64.b64decode(video_url).decode('utf8')
  335. audio_url = base64.b64decode(audio_url).decode('utf8')
  336. video_width = video_info['videoResource']['dash']['video_list']['video_3']['vwidth']
  337. video_height = video_info['videoResource']['dash']['video_list']['video_3']['vheight']
  338. video_url_dict["video_url"] = video_url
  339. video_url_dict["audio_url"] = audio_url
  340. video_url_dict["video_width"] = video_width
  341. video_url_dict["video_height"] = video_height
  342. elif "video_list" in video_info['videoResource']['dash'] and 'video_2' in video_info['videoResource']['dash']['video_list']:
  343. video_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  344. audio_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  345. if len(video_url) % 3 == 1:
  346. video_url += '=='
  347. elif len(video_url) % 3 == 2:
  348. video_url += '='
  349. elif len(audio_url) % 3 == 1:
  350. audio_url += '=='
  351. elif len(audio_url) % 3 == 2:
  352. audio_url += '='
  353. video_url = base64.b64decode(video_url).decode('utf8')
  354. audio_url = base64.b64decode(audio_url).decode('utf8')
  355. video_width = video_info['videoResource']['dash']['video_list']['video_2']['vwidth']
  356. video_height = video_info['videoResource']['dash']['video_list']['video_2']['vheight']
  357. video_url_dict["video_url"] = video_url
  358. video_url_dict["audio_url"] = audio_url
  359. video_url_dict["video_width"] = video_width
  360. video_url_dict["video_height"] = video_height
  361. elif "video_list" in video_info['videoResource']['dash'] and 'video_1' in video_info['videoResource']['dash']['video_list']:
  362. video_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  363. audio_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  364. if len(video_url) % 3 == 1:
  365. video_url += '=='
  366. elif len(video_url) % 3 == 2:
  367. video_url += '='
  368. elif len(audio_url) % 3 == 1:
  369. audio_url += '=='
  370. elif len(audio_url) % 3 == 2:
  371. audio_url += '='
  372. video_url = base64.b64decode(video_url).decode('utf8')
  373. audio_url = base64.b64decode(audio_url).decode('utf8')
  374. video_width = video_info['videoResource']['dash']['video_list']['video_1']['vwidth']
  375. video_height = video_info['videoResource']['dash']['video_list']['video_1']['vheight']
  376. video_url_dict["video_url"] = video_url
  377. video_url_dict["audio_url"] = audio_url
  378. video_url_dict["video_width"] = video_width
  379. video_url_dict["video_height"] = video_height
  380. elif 'dynamic_video' in video_info['videoResource']['dash'] \
  381. and 'dynamic_video_list' in video_info['videoResource']['dash']['dynamic_video'] \
  382. and 'dynamic_audio_list' in video_info['videoResource']['dash']['dynamic_video'] \
  383. and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list']) != 0 \
  384. and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list']) != 0:
  385. video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  386. audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  387. if len(video_url) % 3 == 1:
  388. video_url += '=='
  389. elif len(video_url) % 3 == 2:
  390. video_url += '='
  391. elif len(audio_url) % 3 == 1:
  392. audio_url += '=='
  393. elif len(audio_url) % 3 == 2:
  394. audio_url += '='
  395. video_url = base64.b64decode(video_url).decode('utf8')
  396. audio_url = base64.b64decode(audio_url).decode('utf8')
  397. video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  398. video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  399. video_url_dict["video_url"] = video_url
  400. video_url_dict["audio_url"] = audio_url
  401. video_url_dict["video_width"] = video_width
  402. video_url_dict["video_height"] = video_height
  403. else:
  404. video_url_dict["video_url"] = ''
  405. video_url_dict["audio_url"] = ''
  406. video_url_dict["video_width"] = 0
  407. video_url_dict["video_height"] = 0
  408. elif 'normal' in video_info['videoResource']:
  409. if "video_list" in video_info['videoResource']['normal'] and 'video_4' in \
  410. video_info['videoResource']['normal']['video_list']:
  411. video_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  412. audio_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  413. if len(video_url) % 3 == 1:
  414. video_url += '=='
  415. elif len(video_url) % 3 == 2:
  416. video_url += '='
  417. elif len(audio_url) % 3 == 1:
  418. audio_url += '=='
  419. elif len(audio_url) % 3 == 2:
  420. audio_url += '='
  421. video_url = base64.b64decode(video_url).decode('utf8')
  422. audio_url = base64.b64decode(audio_url).decode('utf8')
  423. video_width = video_info['videoResource']['normal']['video_list']['video_4']['vwidth']
  424. video_height = video_info['videoResource']['normal']['video_list']['video_4']['vheight']
  425. video_url_dict["video_url"] = video_url
  426. video_url_dict["audio_url"] = audio_url
  427. video_url_dict["video_width"] = video_width
  428. video_url_dict["video_height"] = video_height
  429. elif "video_list" in video_info['videoResource']['normal'] and 'video_3' in \
  430. video_info['videoResource']['normal']['video_list']:
  431. video_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  432. audio_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  433. if len(video_url) % 3 == 1:
  434. video_url += '=='
  435. elif len(video_url) % 3 == 2:
  436. video_url += '='
  437. elif len(audio_url) % 3 == 1:
  438. audio_url += '=='
  439. elif len(audio_url) % 3 == 2:
  440. audio_url += '='
  441. video_url = base64.b64decode(video_url).decode('utf8')
  442. audio_url = base64.b64decode(audio_url).decode('utf8')
  443. video_width = video_info['videoResource']['normal']['video_list']['video_3']['vwidth']
  444. video_height = video_info['videoResource']['normal']['video_list']['video_3']['vheight']
  445. video_url_dict["video_url"] = video_url
  446. video_url_dict["audio_url"] = audio_url
  447. video_url_dict["video_width"] = video_width
  448. video_url_dict["video_height"] = video_height
  449. elif "video_list" in video_info['videoResource']['normal'] and 'video_2' in \
  450. video_info['videoResource']['normal']['video_list']:
  451. video_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  452. audio_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  453. if len(video_url) % 3 == 1:
  454. video_url += '=='
  455. elif len(video_url) % 3 == 2:
  456. video_url += '='
  457. elif len(audio_url) % 3 == 1:
  458. audio_url += '=='
  459. elif len(audio_url) % 3 == 2:
  460. audio_url += '='
  461. video_url = base64.b64decode(video_url).decode('utf8')
  462. audio_url = base64.b64decode(audio_url).decode('utf8')
  463. video_width = video_info['videoResource']['normal']['video_list']['video_2']['vwidth']
  464. video_height = video_info['videoResource']['normal']['video_list']['video_2']['vheight']
  465. video_url_dict["video_url"] = video_url
  466. video_url_dict["audio_url"] = audio_url
  467. video_url_dict["video_width"] = video_width
  468. video_url_dict["video_height"] = video_height
  469. elif "video_list" in video_info['videoResource']['normal'] and 'video_1' in \
  470. video_info['videoResource']['normal']['video_list']:
  471. video_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  472. audio_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  473. if len(video_url) % 3 == 1:
  474. video_url += '=='
  475. elif len(video_url) % 3 == 2:
  476. video_url += '='
  477. elif len(audio_url) % 3 == 1:
  478. audio_url += '=='
  479. elif len(audio_url) % 3 == 2:
  480. audio_url += '='
  481. video_url = base64.b64decode(video_url).decode('utf8')
  482. audio_url = base64.b64decode(audio_url).decode('utf8')
  483. video_width = video_info['videoResource']['normal']['video_list']['video_1']['vwidth']
  484. video_height = video_info['videoResource']['normal']['video_list']['video_1']['vheight']
  485. video_url_dict["video_url"] = video_url
  486. video_url_dict["audio_url"] = audio_url
  487. video_url_dict["video_width"] = video_width
  488. video_url_dict["video_height"] = video_height
  489. elif 'dynamic_video' in video_info['videoResource']['normal'] \
  490. and 'dynamic_video_list' in video_info['videoResource']['normal']['dynamic_video'] \
  491. and 'dynamic_audio_list' in video_info['videoResource']['normal']['dynamic_video'] \
  492. and len(video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list']) != 0 \
  493. and len(video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list']) != 0:
  494. video_url = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  495. 'backup_url_1']
  496. audio_url = video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list'][-1][
  497. 'backup_url_1']
  498. if len(video_url) % 3 == 1:
  499. video_url += '=='
  500. elif len(video_url) % 3 == 2:
  501. video_url += '='
  502. elif len(audio_url) % 3 == 1:
  503. audio_url += '=='
  504. elif len(audio_url) % 3 == 2:
  505. audio_url += '='
  506. video_url = base64.b64decode(video_url).decode('utf8')
  507. audio_url = base64.b64decode(audio_url).decode('utf8')
  508. video_width = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  509. 'vwidth']
  510. video_height = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  511. 'vheight']
  512. video_url_dict["video_url"] = video_url
  513. video_url_dict["audio_url"] = audio_url
  514. video_url_dict["video_width"] = video_width
  515. video_url_dict["video_height"] = video_height
  516. else:
  517. video_url_dict["video_url"] = ''
  518. video_url_dict["audio_url"] = ''
  519. video_url_dict["video_width"] = 0
  520. video_url_dict["video_height"] = 0
  521. else:
  522. video_url_dict["video_url"] = ''
  523. video_url_dict["audio_url"] = ''
  524. video_url_dict["video_width"] = 0
  525. video_url_dict["video_height"] = 0
  526. return video_url_dict
  527. except Exception as e:
  528. Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  529. @classmethod
  530. def get_videolist(cls, log_type, crawler, task, our_uid, out_uid, oss_endpoint, env):
  531. try:
  532. signature = cls.random_signature()
  533. while True:
  534. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  535. params = {
  536. 'to_user_id': str(out_uid),
  537. 'offset': str(cls.offset),
  538. 'limit': '30',
  539. 'maxBehotTime': '0',
  540. 'order': 'new',
  541. 'isHome': '0',
  542. '_signature': signature,
  543. }
  544. headers = {
  545. 'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  546. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  547. }
  548. urllib3.disable_warnings()
  549. s = requests.session()
  550. # max_retries=3 重试3次
  551. s.mount('http://', HTTPAdapter(max_retries=3))
  552. s.mount('https://', HTTPAdapter(max_retries=3))
  553. response = s.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
  554. response.close()
  555. cls.offset += 30
  556. if response.status_code != 200:
  557. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  558. cls.offset = 0
  559. return
  560. elif 'data' not in response.text:
  561. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  562. cls.offset = 0
  563. return
  564. elif 'videoList' not in response.json()["data"]:
  565. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  566. cls.offset = 0
  567. return
  568. else:
  569. videoList = response.json()['data']['videoList']
  570. for i in range(len(videoList)):
  571. # video_title
  572. if 'title' not in videoList[i]:
  573. video_title = 0
  574. else:
  575. video_title = videoList[i]['title'].strip().replace('手游', '') \
  576. .replace('/', '').replace('\/', '').replace('\n', '')
  577. # video_id
  578. if 'video_id' not in videoList[i]:
  579. video_id = 0
  580. else:
  581. video_id = videoList[i]['video_id']
  582. # gid
  583. if 'gid' not in videoList[i]:
  584. gid = 0
  585. else:
  586. gid = videoList[i]['gid']
  587. # play_cnt
  588. if 'video_detail_info' not in videoList[i]:
  589. play_cnt = 0
  590. elif 'video_watch_count' not in videoList[i]['video_detail_info']:
  591. play_cnt = 0
  592. else:
  593. play_cnt = videoList[i]['video_detail_info']['video_watch_count']
  594. # comment_cnt
  595. if 'comment_count' not in videoList[i]:
  596. comment_cnt = 0
  597. else:
  598. comment_cnt = videoList[i]['comment_count']
  599. # like_cnt
  600. if 'digg_count' not in videoList[i]:
  601. like_cnt = 0
  602. else:
  603. like_cnt = videoList[i]['digg_count']
  604. # share_cnt
  605. share_cnt = 0
  606. # video_duration
  607. if 'video_duration' not in videoList[i]:
  608. video_duration = 0
  609. else:
  610. video_duration = int(videoList[i]['video_duration'])
  611. # send_time
  612. if 'publish_time' not in videoList[i]:
  613. publish_time = 0
  614. else:
  615. publish_time = videoList[i]['publish_time']
  616. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
  617. # is_top
  618. if 'is_top' not in videoList[i]:
  619. is_top = 0
  620. else:
  621. is_top = videoList[i]['is_top']
  622. # user_name
  623. if 'user_info' not in videoList[i]:
  624. user_name = 0
  625. elif 'name' not in videoList[i]['user_info']:
  626. user_name = 0
  627. else:
  628. user_name = videoList[i]['user_info']['name']
  629. # user_id
  630. if 'user_info' not in videoList[i]:
  631. user_id = 0
  632. elif 'user_id' not in videoList[i]['user_info']:
  633. user_id = 0
  634. else:
  635. user_id = videoList[i]['user_info']['user_id']
  636. # avatar_url
  637. if 'user_info' not in videoList[i]:
  638. avatar_url = 0
  639. elif 'avatar_url' not in videoList[i]['user_info']:
  640. avatar_url = 0
  641. else:
  642. avatar_url = videoList[i]['user_info']['avatar_url']
  643. # cover_url
  644. if 'video_detail_info' not in videoList[i]:
  645. cover_url = 0
  646. elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
  647. cover_url = 0
  648. elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
  649. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
  650. else:
  651. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
  652. min_publish_time = int(task["min_publish_time"])
  653. min_publish_day = int(task["min_publish_day"])
  654. min_publish_day = (date.today() + timedelta(days=-min_publish_day)).strftime("%Y-%m-%d")
  655. min_publish_day = int(time.mktime(time.strptime(min_publish_day, "%Y-%m-%d")))
  656. if min_publish_time > 0 and min_publish_day > 0:
  657. publish_time_rule = min_publish_time
  658. elif min_publish_time > 0:
  659. publish_time_rule = min_publish_time
  660. else:
  661. publish_time_rule = min_publish_day
  662. if gid == 0 or video_id == 0 or cover_url == 0:
  663. Common.logger(log_type, crawler).info('无效视频\n')
  664. elif is_top is True and int(publish_time) < publish_time_rule:
  665. Common.logger(log_type, crawler).info(f'置顶视频,且发布时间超过抓取时间\n')
  666. elif int(publish_time) < publish_time_rule:
  667. Common.logger(log_type, crawler).info(f'发布时间超过抓取时间\n')
  668. cls.offset = 0
  669. return
  670. else:
  671. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  672. video_url = video_url_dict["video_url"]
  673. audio_url = video_url_dict["audio_url"]
  674. video_width = video_url_dict["video_width"]
  675. video_height = video_url_dict["video_height"]
  676. video_dict = {'video_title': video_title,
  677. 'video_id': video_id,
  678. 'gid': gid,
  679. 'play_cnt': play_cnt,
  680. 'comment_cnt': comment_cnt,
  681. 'like_cnt': like_cnt,
  682. 'share_cnt': share_cnt,
  683. 'video_width': video_width,
  684. 'video_height': video_height,
  685. 'duration': video_duration,
  686. 'publish_time_stamp': publish_time,
  687. 'publish_time_str': publish_time_str,
  688. 'is_top': is_top,
  689. 'user_name': user_name,
  690. 'user_id': user_id,
  691. 'avatar_url': avatar_url,
  692. 'cover_url': cover_url,
  693. 'audio_url': audio_url,
  694. 'video_url': video_url,
  695. 'session': signature}
  696. for k, v in video_dict.items():
  697. Common.logger(log_type, crawler).info(f"{k}:{v}")
  698. cls.download_publish(log_type=log_type,
  699. crawler=crawler,
  700. video_dict=video_dict,
  701. task=task,
  702. strategy=task["task_name"],
  703. our_uid=our_uid,
  704. oss_endpoint=oss_endpoint,
  705. env=env)
  706. except Exception as e:
  707. Common.logger(log_type, crawler).error(f"get_videolist:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Return the number of ``crawler_video`` rows already stored for a video.

      The lookup filters on ``cls.platform`` and on ``out_video_id == video_id``
      and is executed through ``MysqlHelper.get_values``; the length of whatever
      that call returns is handed back, so ``0`` means "not seen before".
      """
      # NOTE(review): video_id is interpolated straight into the SQL text.
      # Confirm whether MysqlHelper offers a parameterized variant.
      lookup_sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      return len(MysqlHelper.get_values(log_type, crawler, lookup_sql, env))
  # Download / upload
  @classmethod
  def download_publish(cls, log_type, crawler, strategy, video_dict, task, our_uid, oss_endpoint, env):
      """Download, compose, upload and record a single video.

      The video is skipped (with an info log) when it fails
      ``cls.download_rule_scheduling``, when its title contains one of the words
      returned by ``cls.filter_words``, or when ``cls.repeat_video`` reports an
      existing row. Otherwise the video, audio and cover are fetched through
      ``Common``, the result is uploaded with ``Publish.upload_and_publish``,
      a row is written to the Feishu sheet and a record is inserted into the
      ``crawler_video`` table.

      Args:
          log_type: Forwarded to ``Common.logger`` and the other helpers.
          crawler: Crawler name; also the root of the local
              ``./{crawler}/videos`` working folder.
          strategy: Forwarded to ``Publish.upload_and_publish``.
          video_dict: Video metadata. Keys read here: ``video_title``,
              ``video_id``, ``gid``, ``play_cnt``, ``comment_cnt``,
              ``like_cnt``, ``share_cnt``, ``duration``, ``video_width``,
              ``video_height``, ``publish_time_str``, ``user_name``,
              ``user_id``, ``avatar_url``, ``cover_url``, ``video_url`` and
              ``audio_url``.
          task: Task settings. Keys read here: ``play_cnt``, ``video_width``,
              ``video_height``, ``video_like``, ``share_cnt``,
              ``duration_min`` and ``duration_max``.
          our_uid: Forwarded to the upload call and stored as ``user_id``.
          oss_endpoint: Forwarded to ``Publish.upload_and_publish``.
          env: ``'dev'`` selects the test admin link; also forwarded to the
              upload and database helpers.

      Returns:
          None. Any exception is caught and logged instead of being raised.
      """

      def _sql_text(value):
          # Escape backslashes and double quotes so scraped text cannot end the
          # double-quoted SQL string literal it is embedded in.
          return str(value).replace('\\', '\\\\').replace('"', '\\"')

      try:
          title = video_dict['video_title']

          if cls.download_rule_scheduling(video_dict, task) is False:
              Common.logger(log_type, crawler).info('不满足抓取规则\n')
              return
          # Empty filter words never match, exactly as before.
          if any(word and word in title for word in cls.filter_words(log_type, crawler)):
              Common.logger(log_type, crawler).info(f'标题已中过滤词:{title}\n')
              return
          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          title_dir = f"./{crawler}/videos/{title}"
          md_title = md5(title.encode('utf8')).hexdigest()
          md_dir = f"./{crawler}/videos/{md_title}"

          # Download the video stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=title, url=video_dict['video_url'])
          # Download the audio stream
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=title, url=video_dict['audio_url'])
          # Merge audio and video
          # NOTE(review): this passes the title-named folder while the size check
          # below reads the md5-named folder; confirm which one video_compose expects.
          Common.video_compose(log_type=log_type, crawler=crawler, video_dir=title_dir)
          if os.path.getsize(f"{md_dir}/video.mp4") == 0:
              # Remove the video folder
              shutil.rmtree(md_dir)
              Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
              return

          # Download the cover image
          Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=title, url=video_dict['cover_url'])
          # Save the video info to a txt file
          Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

          # Upload the video
          Common.logger(log_type, crawler).info("开始上传视频...")
          our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                    crawler=crawler,
                                                    strategy=strategy,
                                                    our_uid=our_uid,
                                                    env=env,
                                                    oss_endpoint=oss_endpoint)
          Common.logger(log_type, crawler).info("视频上传完成")
          if our_video_id is None:
              # Remove the video folder. The size check above proves the md5-named
              # folder exists; the title-named one is removed too in case it exists.
              for leftover in (md_dir, title_dir):
                  shutil.rmtree(leftover, ignore_errors=True)
              return

          admin_host = "testadmin.piaoquantv.com" if env == 'dev' else "admin.piaoquantv.com"
          our_video_link = f"https://{admin_host}/cms/post-detail/{our_video_id}/info"

          # Write the video to the Feishu sheet
          Feishu.insert_columns(log_type, 'xigua', "e075e9", "ROWS", 1, 2)
          upload_time = int(time.time())
          values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                     "定向榜",
                     video_dict['video_title'],
                     str(video_dict['video_id']),
                     our_video_link,
                     video_dict['gid'],
                     video_dict['play_cnt'],
                     video_dict['comment_cnt'],
                     video_dict['like_cnt'],
                     video_dict['share_cnt'],
                     video_dict['duration'],
                     str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                     video_dict['publish_time_str'],
                     video_dict['user_name'],
                     video_dict['user_id'],
                     video_dict['avatar_url'],
                     video_dict['cover_url'],
                     video_dict['video_url'],
                     video_dict['audio_url']]]
          time.sleep(1)
          Feishu.update_values(log_type, 'xigua', "e075e9", "F2:Z2", values)
          Common.logger(log_type, crawler).info("视频已保存至云文档\n")

          rule_dict = {
              "play_cnt": task["play_cnt"],
              "video_width": task["video_width"],
              "video_height": task["video_height"],
              "video_like": task["video_like"],
              "share_cnt": task["share_cnt"],
              "duration": {"min": task["duration_min"], "max": task["duration_max"]}
          }

          # Save the video info to the database. Scraped text fields are escaped
          # before being embedded in the statement.
          out_user_id = _sql_text(video_dict['user_id'])
          out_video_id = _sql_text(video_dict['video_id'])
          safe_title = _sql_text(video_dict['video_title'])
          safe_cover_url = _sql_text(video_dict['cover_url'])
          safe_video_url = _sql_text(video_dict['video_url'])
          insert_sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          {our_uid},
                          "{out_user_id}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{out_video_id}",
                          "{safe_title}",
                          "{safe_cover_url}",
                          "{safe_video_url}",
                          {int(video_dict['duration'])},
                          "{video_dict['publish_time_str']}",
                          {int(video_dict['play_cnt'])},
                          '{json.dumps(rule_dict)}',
                          {int(video_dict['video_width'])},
                          {int(video_dict['video_height'])}) """
          Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
          MysqlHelper.update_values(log_type, crawler, insert_sql, env)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
  @classmethod
  def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
      """Crawl the video list of every user returned by ``cls.get_users``.

      Users whose ``our_uid`` converts to ``0`` are skipped. For every other
      user ``cls.get_videolist`` is called, after which ``cls.offset`` is reset
      to ``0`` and the loop pauses for one second. Any exception is caught and
      logged instead of being raised.
      """
      try:
          for account in cls.get_users(log_type=log_type, crawler=crawler, task=task, env=env):
              out_uid = account["out_uid"]
              our_uid = int(account["our_uid"])
              if our_uid == 0:
                  continue

              Common.logger(log_type, crawler).info(f"开始抓取 {out_uid} 用户主页视频\n")
              crawl_kwargs = dict(log_type=log_type,
                                  crawler=crawler,
                                  task=task,
                                  our_uid=our_uid,
                                  out_uid=out_uid,
                                  oss_endpoint=oss_endpoint,
                                  env=env)
              cls.get_videolist(**crawl_kwargs)
              # Reset the offset so the next user's crawl starts from the beginning.
              cls.offset = 0
              time.sleep(1)
      except Exception as e:
          Common.logger(log_type, crawler).error(f"get_follow_videos:{e}\n")
  if __name__ == '__main__':
      # Manual check: print how many stored rows match a known video id in dev.
      # An earlier manual check, kept for reference:
      # SchedulingFollow.get_users(log_type="follow",
      #                            crawler="xigua",
      #                            spider_rule="['https://www.ixigua.com/home/95420624045', 'https://www.ixigua.com/home/6431477489']",
      #                            env="dev",
      #                            machine="local")
      stored_rows = SchedulingFollow.repeat_video("follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev")
      print(stored_rows)