xigua_follow_scheduling.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. from datetime import date, timedelta
  13. from hashlib import md5
  14. import requests
  15. import urllib3
  16. from requests.adapters import HTTPAdapter
  17. sys.path.append(os.getcwd())
  18. from common.scheduling_db import MysqlHelper
  19. from common.common import Common
  20. from common.feishu import Feishu
  21. from common.publish import Publish
  22. from common.public import get_user_from_mysql, get_config_from_mysql, download_rule
  23. class ScheduleXiguaFollow:
  24. # 个人主页视频翻页参数
  25. offset = 0
  26. platform = "西瓜视频"
  27. @classmethod
  28. def download_rule(cls, video_info_dict, rule_dict):
  29. if video_info_dict['play_cnt'] >= rule_dict['play_cnt']['min']:
  30. if video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']['min']:
  31. if video_info_dict['like_cnt'] >= rule_dict['like_cnt']['min']:
  32. if video_info_dict['duration'] >= rule_dict['duration']['min']:
  33. if video_info_dict['video_width'] >= rule_dict['width']['min'] \
  34. or video_info_dict['video_height'] >= rule_dict['height']['min']:
  35. return True
  36. else:
  37. return False
  38. else:
  39. return False
  40. else:
  41. return False
  42. else:
  43. return False
  44. else:
  45. return False
  @classmethod
  def get_users(cls, log_type, crawler, task, env):
      """Build the list of accounts to crawl from the task's profile links.

      For every URL in ``task['spider_link']`` the text after
      ``https://www.ixigua.com/home/`` (slashes and surrounding whitespace
      removed) is parsed as the integer ``out_uid``.  The ``crawler_author_map``
      table is then queried for rows whose ``spider_link`` equals the URL; the
      first row's ``media_id`` becomes ``our_uid``, or ``0`` when no row exists.

      :param log_type: forwarded to ``Common.logger`` and ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``Common.logger`` and ``MysqlHelper.get_values``.
      :param task: mapping with a ``'spider_link'`` iterable of profile URLs.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :return: list of ``{"out_uid": ..., "our_uid": ...}`` dicts, in link order.
      :raises ValueError: if the tail of a link is not an integer.
      """
      home_prefix = "https://www.ixigua.com/home/"
      users = []
      for link in task['spider_link']:
          uid_text = link.split(home_prefix)[-1].replace("/", "").strip()
          out_uid = int(uid_text)

          # NOTE(review): the link is interpolated straight into the SQL text;
          # confirm spider_link values are trusted or switch to a parameterized
          # query if MysqlHelper offers one.
          query = f""" select * from crawler_author_map where spider_link="{link}" """
          rows = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=query, env=env)

          if len(rows) != 0:
              our_uid = rows[0]["media_id"]
          else:
              our_uid = 0
              Common.logger(log_type, crawler).info(f"没有站内虚拟账号: {link}\n")

          users.append({"out_uid": out_uid, "our_uid": our_uid})

      Common.logger(log_type, crawler).info(f"user_list:{users}")
      return users
  # Filter-word list
  @classmethod
  def filter_words(cls, log_type, crawler):
      """Fetch the filter-word list from Feishu sheet ``KGB4Hc``.

      The sheet is requested until ``Feishu.get_values_batch`` returns something
      other than ``None``, waiting 10 seconds between attempts.  Every cell that
      is not ``None`` is then collected, row by row, into one flat list.

      :param log_type: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
      :param crawler: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
      :return: flat list of the sheet's non-``None`` cells, or ``None`` when an
          exception was raised (the exception is logged, not propagated).
      """
      try:
          while True:
              sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
              if sheet is None:
                  Common.logger(log_type, crawler).warning(f"filter_words_sheet:{sheet} 10秒钟后重试")
                  # The log message announces a 10-second wait; without this
                  # sleep the loop would retry immediately and spin on the API.
                  time.sleep(10)
                  continue
              return [cell for row in sheet for cell in row if cell is not None]
      except Exception as e:
          Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  87. @classmethod
  88. def random_signature(cls):
  89. src_digits = string.digits # string_数字
  90. src_uppercase = string.ascii_uppercase # string_大写字母
  91. src_lowercase = string.ascii_lowercase # string_小写字母
  92. digits_num = random.randint(1, 6)
  93. uppercase_num = random.randint(1, 26 - digits_num - 1)
  94. lowercase_num = 26 - (digits_num + uppercase_num)
  95. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  96. src_lowercase, lowercase_num)
  97. random.shuffle(password)
  98. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  99. new_password_start = new_password[0:18]
  100. new_password_end = new_password[-7:]
  101. if new_password[18] == '8':
  102. new_password = new_password_start + 'w' + new_password_end
  103. elif new_password[18] == '9':
  104. new_password = new_password_start + 'x' + new_password_end
  105. elif new_password[18] == '-':
  106. new_password = new_password_start + 'y' + new_password_end
  107. elif new_password[18] == '.':
  108. new_password = new_password_start + 'z' + new_password_end
  109. else:
  110. new_password = new_password_start + 'y' + new_password_end
  111. return new_password
  @staticmethod
  def _decode_backup_url(encoded):
      """Decode a base64-encoded URL, adding whatever '=' padding is missing."""
      # Base64 text is processed in groups of 4 characters, so the amount of
      # missing padding follows from len % 4 (not len % 3).
      padded = encoded + '=' * (-len(encoded) % 4)
      return base64.b64decode(padded).decode('utf8')

  @staticmethod
  def _empty_video_url_dict():
      """Return the placeholder result used when no playable stream is found."""
      return {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}

  @staticmethod
  def _select_stream_entries(resource):
      """Pick the (video_entry, audio_entry) pair from one resource, or None."""
      # Fixed-quality streams first, highest index first; the same entry is
      # used for both the video and the audio URL.
      if "video_list" in resource:
          video_list = resource['video_list']
          for quality in ('video_4', 'video_3', 'video_2', 'video_1'):
              if quality in video_list:
                  return video_list[quality], video_list[quality]
      # Otherwise fall back to the last entries of the dynamic video/audio lists.
      if 'dynamic_video' in resource:
          dynamic = resource['dynamic_video']
          if 'dynamic_video_list' in dynamic and 'dynamic_audio_list' in dynamic:
              videos = dynamic['dynamic_video_list']
              audios = dynamic['dynamic_audio_list']
              if len(videos) != 0 and len(audios) != 0:
                  return videos[-1], audios[-1]
      return None

  # Fetch video details
  @classmethod
  def get_video_url(cls, log_type, crawler, gid):
      """Resolve the playable video/audio URLs and resolution for one video.

      Requests ``https://www.ixigua.com/api/mixVideo/information`` with
      ``mixId=gid`` and reads ``data.gidInformation.packerData.video``.  Within
      ``videoResource`` the first present key of ``dash_120fps``, ``dash``,
      ``normal`` is used (later keys are not consulted even if that one holds no
      usable stream).  Inside it, ``video_4`` .. ``video_1`` of ``video_list``
      are preferred, then the last entries of ``dynamic_video``.  The entries'
      ``backup_url_1`` values are base64-decoded into URLs.

      :param log_type: forwarded to ``Common.logger``.
      :param crawler: forwarded to ``Common.logger``.
      :param gid: value sent as the ``mixId`` query parameter.
      :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
          ``video_height``; empty strings / zeros when no stream is found.
          ``None`` when the response carries no ``data`` or any exception is
          raised (both cases are logged, nothing is propagated).
      """
      try:
          url = 'https://www.ixigua.com/api/mixVideo/information?'
          headers = {
              "accept-encoding": "gzip, deflate",
              "accept-language": "zh-CN,zh-Hans;q=0.9",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
              "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
          }
          params = {
              'mixId': gid,
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                         'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
              '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                            'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
          }
          cookies = {
              'ixigua-a-s': '1',
              'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                         'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
              'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                       '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
              'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
              'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
              '__ac_nonce': '06304878000964fdad287',
              '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                                'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
              'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
              '_tea_utm_cache_1300': 'undefined',
              'support_avif': 'false',
              'support_webp': 'false',
              'xiguavideopcwebid': '7134967546256016900',
              'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
          }
          urllib3.disable_warnings()
          # The context manager closes the session (and its adapters) even when
          # the request raises.
          with requests.session() as s:
              # max_retries=3: retry up to 3 times
              s.mount('http://', HTTPAdapter(max_retries=3))
              s.mount('https://', HTTPAdapter(max_retries=3))
              response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
                               proxies=Common.tunnel_proxies(), timeout=5)
              response.close()
              # Parse the body once instead of on every access.
              payload = response.json()

          if 'data' not in payload or payload['data'] == '':
              Common.logger(log_type, crawler).warning(f'get_video_info: response: {response}')
              return None

          video_info = payload['data']['gidInformation']['packerData']['video']
          if 'videoResource' not in video_info:
              return cls._empty_video_url_dict()

          # Only the first resource kind that is present is inspected.
          resources = video_info['videoResource']
          entries = None
          for kind in ('dash_120fps', 'dash', 'normal'):
              if kind in resources:
                  entries = cls._select_stream_entries(resources[kind])
                  break
          if entries is None:
              return cls._empty_video_url_dict()

          video_entry, audio_entry = entries
          # Video and audio are padded and decoded independently.
          return {
              "video_url": cls._decode_backup_url(video_entry['backup_url_1']),
              "audio_url": cls._decode_backup_url(audio_entry['backup_url_1']),
              "video_width": video_entry['vwidth'],
              "video_height": video_entry['vheight'],
          }
      except Exception as e:
          Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
  522. @classmethod
  523. def get_videolist(cls, log_type, crawler, strategy, task, our_uid, out_uid, oss_endpoint, env):
  524. try:
  525. signature = cls.random_signature()
  526. while True:
  527. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  528. params = {
  529. 'to_user_id': str(out_uid),
  530. 'offset': str(cls.offset),
  531. 'limit': '30',
  532. 'maxBehotTime': '0',
  533. 'order': 'new',
  534. 'isHome': '0',
  535. # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  536. # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  537. '_signature': signature,
  538. }
  539. headers = {
  540. 'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  541. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  542. }
  543. urllib3.disable_warnings()
  544. s = requests.session()
  545. # max_retries=3 重试3次
  546. s.mount('http://', HTTPAdapter(max_retries=3))
  547. s.mount('https://', HTTPAdapter(max_retries=3))
  548. response = s.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False,
  549. timeout=5)
  550. response.close()
  551. cls.offset += 30
  552. if response.status_code != 200:
  553. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  554. cls.offset = 0
  555. return
  556. elif 'data' not in response.text:
  557. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  558. cls.offset = 0
  559. return
  560. elif not response.json()["data"]['videoList']:
  561. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  562. cls.offset = 0
  563. return
  564. else:
  565. videoList = response.json()['data']['videoList']
  566. for i in range(len(videoList)):
  567. # video_title
  568. if 'title' not in videoList[i]:
  569. video_title = 0
  570. else:
  571. video_title = videoList[i]['title'].strip().replace('手游', '') \
  572. .replace('/', '').replace('\/', '').replace('\n', '').replace('"', '').replace("'", '')
  573. # video_id
  574. if 'video_id' not in videoList[i]:
  575. video_id = 0
  576. else:
  577. video_id = videoList[i]['video_id']
  578. # gid
  579. if 'gid' not in videoList[i]:
  580. gid = 0
  581. else:
  582. gid = videoList[i]['gid']
  583. # play_cnt
  584. if 'video_detail_info' not in videoList[i]:
  585. play_cnt = 0
  586. elif 'video_watch_count' not in videoList[i]['video_detail_info']:
  587. play_cnt = 0
  588. else:
  589. play_cnt = videoList[i]['video_detail_info']['video_watch_count']
  590. # comment_cnt
  591. if 'comment_count' not in videoList[i]:
  592. comment_cnt = 0
  593. else:
  594. comment_cnt = videoList[i]['comment_count']
  595. # like_cnt
  596. if 'digg_count' not in videoList[i]:
  597. like_cnt = 0
  598. else:
  599. like_cnt = videoList[i]['digg_count']
  600. # share_cnt
  601. share_cnt = 0
  602. # video_duration
  603. if 'video_duration' not in videoList[i]:
  604. video_duration = 0
  605. else:
  606. video_duration = int(videoList[i]['video_duration'])
  607. # send_time
  608. if 'publish_time' not in videoList[i]:
  609. publish_time = 0
  610. else:
  611. publish_time = videoList[i]['publish_time']
  612. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
  613. # is_top
  614. if 'is_top' not in videoList[i]:
  615. is_top = 0
  616. else:
  617. is_top = videoList[i]['is_top']
  618. # user_name
  619. if 'user_info' not in videoList[i]:
  620. user_name = 0
  621. elif 'name' not in videoList[i]['user_info']:
  622. user_name = 0
  623. else:
  624. user_name = videoList[i]['user_info']['name']
  625. # user_id
  626. if 'user_info' not in videoList[i]:
  627. user_id = 0
  628. elif 'user_id' not in videoList[i]['user_info']:
  629. user_id = 0
  630. else:
  631. user_id = videoList[i]['user_info']['user_id']
  632. # avatar_url
  633. if 'user_info' not in videoList[i]:
  634. avatar_url = 0
  635. elif 'avatar_url' not in videoList[i]['user_info']:
  636. avatar_url = 0
  637. else:
  638. avatar_url = videoList[i]['user_info']['avatar_url']
  639. # cover_url
  640. if 'video_detail_info' not in videoList[i]:
  641. cover_url = 0
  642. elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
  643. cover_url = 0
  644. elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
  645. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
  646. else:
  647. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0][
  648. 'url']
  649. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  650. video_url = video_url_dict["video_url"]
  651. audio_url = video_url_dict["audio_url"]
  652. video_width = video_url_dict["video_width"]
  653. video_height = video_url_dict["video_height"]
  654. video_dict = {'video_title': video_title,
  655. 'video_id': video_id,
  656. 'gid': gid,
  657. 'play_cnt': play_cnt,
  658. 'comment_cnt': comment_cnt,
  659. 'like_cnt': like_cnt,
  660. 'share_cnt': share_cnt,
  661. 'video_width': video_width,
  662. 'video_height': video_height,
  663. 'duration': video_duration,
  664. 'publish_time_stamp': publish_time,
  665. 'publish_time_str': publish_time_str,
  666. 'is_top': is_top,
  667. 'user_name': user_name,
  668. 'user_id': user_id,
  669. 'avatar_url': avatar_url,
  670. 'cover_url': cover_url,
  671. 'audio_url': audio_url,
  672. 'video_url': video_url,
  673. 'session': signature}
  674. for k, v in video_dict.items():
  675. Common.logger(log_type, crawler).info(f"{k}:{v}")
  676. cls.download_publish(log_type=log_type,
  677. crawler=crawler,
  678. strategy=strategy,
  679. video_dict=video_dict,
  680. task=task,
  681. our_uid=our_uid,
  682. oss_endpoint=oss_endpoint,
  683. env=env)
  684. except Exception as e:
  685. Common.logger(log_type, crawler).error(f"get_videolist:{e}\n")
  @classmethod
  def repeat_video(cls, log_type, crawler, video_id, env):
      """Return how many ``crawler_video`` rows already exist for a video.

      The lookup filters on this class's ``platform`` and on
      ``out_video_id == video_id``.

      :param log_type: forwarded to ``MysqlHelper.get_values``.
      :param crawler: forwarded to ``MysqlHelper.get_values``.
      :param video_id: source-platform video id to look up.
      :param env: forwarded to ``MysqlHelper.get_values``.
      :return: length of the result returned by ``MysqlHelper.get_values``.
      """
      # NOTE(review): the statement is assembled by string interpolation, so
      # a video_id containing a double quote would break (or alter) the
      # query -- confirm whether MysqlHelper offers a parameterized variant.
      query = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
      matched_rows = MysqlHelper.get_values(log_type, crawler, query, env)
      return len(matched_rows)
  691. # 下载 / 上传
  @classmethod
  def download_publish(cls, log_type, crawler, strategy, video_dict, task, our_uid, oss_endpoint, env):
      """Download one video, publish it, and record it in Feishu and MySQL.

      The video is skipped (nothing is downloaded) when its title contains
      a configured filter word, when ``download_rule`` returns ``False``,
      or when ``repeat_video`` reports an existing row. Otherwise the video
      and audio streams are downloaded and merged, the cover is downloaded,
      the metadata is saved, the result is uploaded through
      ``Publish.upload_and_publish``, and one row is written to the Feishu
      sheet and to the ``crawler_video`` table.

      Any exception raised along the way is caught and logged; nothing is
      re-raised to the caller.

      :param log_type: forwarded to the logging / helper calls.
      :param crawler: crawler name; also used to build the local
          ``./{crawler}/videos/...`` paths.
      :param strategy: forwarded to ``Publish.upload_and_publish``.
      :param video_dict: metadata of the video; the keys read here are
          ``video_title``, ``video_id``, ``gid``, ``play_cnt``,
          ``comment_cnt``, ``like_cnt``, ``share_cnt``, ``duration``,
          ``video_width``, ``video_height``, ``publish_time_str``,
          ``user_name``, ``user_id``, ``avatar_url``, ``cover_url``,
          ``video_url`` and ``audio_url``.
      :param task: task configuration; ``rule_dict`` is passed to
          ``download_rule`` and ``play_cnt``, ``video_width``,
          ``video_height``, ``video_like``, ``share_cnt``,
          ``duration_min`` and ``duration_max`` are stored with the row.
      :param our_uid: forwarded to the upload and stored as ``user_id``.
      :param oss_endpoint: forwarded to ``Publish.upload_and_publish``.
      :param env: ``'dev'`` selects the test admin link; any other value
          selects the production admin link.
      :return: ``None``.
      """
      try:
          # Skip videos whose title contains any configured filter word.
          filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
          title = video_dict['video_title']
          for filter_word in filter_words:
              if filter_word in title:
                  Common.logger(log_type, crawler).info(f'标题已中过滤词:{title}\n')
                  return

          if download_rule(log_type, crawler, video_dict, task['rule_dict']) is False:
              Common.logger(log_type, crawler).info('不满足抓取规则\n')
              return
          if cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
              Common.logger(log_type, crawler).info('视频已下载\n')
              return

          # Download the video stream.
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                 title=title, url=video_dict['video_url'])
          # Download the audio stream.
          Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
                                 title=title, url=video_dict['audio_url'])
          # Merge audio and video.
          # NOTE(review): this passes the title-named directory, whereas the
          # size check below reads from the md5-named directory -- confirm
          # which one Common.video_compose expects.
          Common.video_compose(log_type=log_type, crawler=crawler,
                               video_dir=f"./{crawler}/videos/{title}")

          md_title = md5(title.encode('utf8')).hexdigest()
          video_dir = f"./{crawler}/videos/{md_title}"
          if os.path.getsize(f"{video_dir}/video.mp4") == 0:
              # Empty output file: delete the video folder and give up.
              shutil.rmtree(video_dir)
              Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
              return

          # Download the cover image.
          Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                                 title=title, url=video_dict['cover_url'])
          # Save the video info to a txt file.
          Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

          # Upload the video.
          Common.logger(log_type, crawler).info("开始上传视频...")
          our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                    crawler=crawler,
                                                    strategy=strategy,
                                                    our_uid=our_uid,
                                                    env=env,
                                                    oss_endpoint=oss_endpoint)
          Common.logger(log_type, crawler).info("视频上传完成")
          if our_video_id is None:
              # Upload produced no id: delete the video folder. This uses the
              # same md5-named directory the size check read from; the
              # title-named path used previously is not where video.mp4 lives.
              shutil.rmtree(video_dir)
              return

          if env == 'dev':
              our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
          else:
              our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"

          # Write the video to the Feishu sheet: insert a row, then fill it.
          Feishu.insert_columns(log_type, 'xigua', "e075e9", "ROWS", 1, 2)
          upload_time = int(time.time())
          values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                     "定向榜",
                     title,
                     str(video_dict['video_id']),
                     our_video_link,
                     video_dict['gid'],
                     video_dict['play_cnt'],
                     video_dict['comment_cnt'],
                     video_dict['like_cnt'],
                     video_dict['share_cnt'],
                     video_dict['duration'],
                     str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                     video_dict['publish_time_str'],
                     video_dict['user_name'],
                     video_dict['user_id'],
                     video_dict['avatar_url'],
                     video_dict['cover_url'],
                     video_dict['video_url'],
                     video_dict['audio_url']]]
          time.sleep(1)
          Feishu.update_values(log_type, 'xigua', "e075e9", "F2:Z2", values)
          Common.logger(log_type, crawler).info("视频已保存至云文档\n")

          rule_dict = {
              "play_cnt": task["play_cnt"],
              "video_width": task["video_width"],
              "video_height": task["video_height"],
              "video_like": task["video_like"],
              "share_cnt": task["share_cnt"],
              "duration": {"min": task["duration_min"], "max": task["duration_max"]},
          }
          # Save the video info to the database.
          # NOTE(review): the statement is built by string interpolation from
          # scraped fields (title, urls, ids); a quote in any of them breaks
          # or alters the query. Confirm whether MysqlHelper supports bound
          # parameters and switch to them if so.
          insert_sql = f""" insert into crawler_video(video_id,
                          user_id,
                          out_user_id,
                          platform,
                          strategy,
                          out_video_id,
                          video_title,
                          cover_url,
                          video_url,
                          duration,
                          publish_time,
                          play_cnt,
                          crawler_rule,
                          width,
                          height)
                          values({our_video_id},
                          {our_uid},
                          "{video_dict['user_id']}",
                          "{cls.platform}",
                          "定向爬虫策略",
                          "{video_dict['video_id']}",
                          "{video_dict['video_title']}",
                          "{video_dict['cover_url']}",
                          "{video_dict['video_url']}",
                          {int(video_dict['duration'])},
                          "{video_dict['publish_time_str']}",
                          {int(video_dict['play_cnt'])},
                          '{json.dumps(rule_dict)}',
                          {int(video_dict['video_width'])},
                          {int(video_dict['video_height'])}) """
          Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
          MysqlHelper.update_values(log_type, crawler, insert_sql, env)
          Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
      except Exception as e:
          Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
  @classmethod
  def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
      """Crawl the video list of every user returned by ``get_user_from_mysql``.

      For each user record the last path segment of ``link`` is used as the
      source-platform uid and ``uid`` as our own uid; both are handed to
      ``get_videolist``. ``cls.offset`` is reset to 0 after each user's
      list has been processed. An exception while handling one user is
      logged and the loop continues with the next user.

      :param log_type: forwarded to the logging / helper calls.
      :param crawler: crawler name; passed twice to ``get_user_from_mysql``.
      :param task: forwarded to ``get_videolist``.
      :param oss_endpoint: forwarded to ``get_videolist``.
      :param env: forwarded to ``get_user_from_mysql`` and ``get_videolist``.
      :return: ``None``.
      """
      followed_users = get_user_from_mysql(log_type, crawler, crawler, env)
      strategy = '定向抓取策略'
      for user in followed_users:
          try:
              home_link = user["link"]
              # The source-platform uid is whatever follows the final '/'.
              out_uid = home_link.rsplit('/', 1)[-1]
              nick_name = user["nick_name"]
              our_uid = user["uid"]
              Common.logger(log_type, crawler).info(f"开始抓取 {nick_name} 用户主页视频\n")
              cls.get_videolist(
                  log_type=log_type,
                  crawler=crawler,
                  strategy=strategy,
                  task=task,
                  our_uid=our_uid,
                  out_uid=out_uid,
                  oss_endpoint=oss_endpoint,
                  env=env,
              )
              # Start the next user from the first page again.
              cls.offset = 0
          except Exception as e:
              Common.logger(log_type, crawler).error(f"get_follow_videos:{e}\n")
  if __name__ == '__main__':
      # Manual check: print the row count repeat_video reports for a sample
      # video id, using the "dev" environment.
      duplicate_count = ScheduleXiguaFollow.repeat_video(
          "follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev")
      print(duplicate_count)