# xigua_follow.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import sys
  8. import time
  9. import requests
  10. import urllib3
  11. from selenium.webdriver import DesiredCapabilities
  12. from selenium.webdriver.chrome.service import Service
  13. from selenium.webdriver.common.by import By
  14. from seleniumwire import webdriver
  15. sys.path.append(os.getcwd())
  16. from common.common import Common
  17. from common.feishu import Feishu
  18. from common.publish import Publish
# Proxy mapping handed to requests.get() in Follow.get_videolist(); both
# schemes are set to None.
# NOTE(review): presumably this is meant to bypass any proxy configured in the
# environment - confirm against the deployment hosts.
proxies = {"http": None, "https": None}
  20. class Follow:
  21. # 个人主页视频翻页参数
  22. offset = 0
  23. # 下载规则
  24. @staticmethod
  25. def download_rule(duration, width, height):
  26. if int(duration) >= 60:
  27. if int(width) >= 720 or int(height) >= 720:
  28. return True
  29. else:
  30. return False
  31. else:
  32. return False
  33. # 过滤词库
  34. @classmethod
  35. def filter_words(cls, log_type, crawler):
  36. try:
  37. filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
  38. filter_words_list = []
  39. for x in filter_words_sheet:
  40. for y in x:
  41. if y is None:
  42. pass
  43. else:
  44. filter_words_list.append(y)
  45. return filter_words_list
  46. except Exception as e:
  47. Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  48. # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
  49. @classmethod
  50. def get_user_info_from_feishu(cls, log_type, crawler):
  51. try:
  52. user_sheet = Feishu.get_values_batch(log_type, crawler, '5tlTYB')
  53. user_dict = {}
  54. for i in range(1, len(user_sheet)):
  55. user_name = user_sheet[i][0]
  56. out_id = user_sheet[i][1]
  57. our_id = user_sheet[i][3]
  58. if user_name is None or out_id is None or our_id is None:
  59. pass
  60. else:
  61. user_dict[user_name] = str(out_id) + ',' + str(our_id)
  62. return user_dict
  63. except Exception as e:
  64. Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
  65. @classmethod
  66. def get_signature(cls, log_type, crawler, out_uid, machine):
  67. try:
  68. # 打印请求配置
  69. ca = DesiredCapabilities.CHROME
  70. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  71. # 不打开浏览器运行
  72. chrome_options = webdriver.ChromeOptions()
  73. chrome_options.add_argument("--headless")
  74. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  75. chrome_options.add_argument("--no-sandbox")
  76. # driver初始化
  77. if machine == 'aliyun' or machine == 'aliyun_hk':
  78. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  79. elif machine == 'macpro':
  80. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  81. service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  82. elif machine == 'macair':
  83. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  84. service=Service('/Users/piaoquan/Downloads/chromedriver'))
  85. else:
  86. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  87. driver.implicitly_wait(10)
  88. driver.get(f'https://www.ixigua.com/home/{out_uid}/')
  89. time.sleep(3)
  90. data_src = driver.find_elements(By.XPATH, '//img[@class="tt-img BU-MagicImage tt-img-loaded"]')[1].get_attribute("data-src")
  91. signature = data_src.split("x-signature=")[-1]
  92. # print(f"data_src:{data_src}")
  93. # print(f"signature:{signature}")
  94. return signature
  95. except Exception as e:
  96. Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
  97. # 获取视频详情
  98. @classmethod
  99. def get_video_url(cls, log_type, crawler, gid):
  100. # try:
  101. url = 'https://www.ixigua.com/api/mixVideo/information?'
  102. headers = {
  103. "accept-encoding": "gzip, deflate",
  104. "accept-language": "zh-CN,zh-Hans;q=0.9",
  105. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
  106. "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
  107. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  108. }
  109. params = {
  110. 'mixId': gid,
  111. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
  112. 'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  113. 'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
  114. '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
  115. 'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
  116. }
  117. cookies = {
  118. 'ixigua-a-s': '1',
  119. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
  120. 'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  121. 'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
  122. '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
  123. 'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
  124. 'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
  125. '__ac_nonce': '06304878000964fdad287',
  126. '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
  127. 'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
  128. 'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
  129. '_tea_utm_cache_1300': 'undefined',
  130. 'support_avif': 'false',
  131. 'support_webp': 'false',
  132. 'xiguavideopcwebid': '7134967546256016900',
  133. 'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
  134. }
  135. urllib3.disable_warnings()
  136. response = requests.get(url=url, headers=headers, params=params, cookies=cookies, verify=False)
  137. if 'data' not in response.json() or response.json()['data'] == '':
  138. Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
  139. else:
  140. video_info = response.json()['data']['gidInformation']['packerData']['video']
  141. video_url_dict = {}
  142. # video_url
  143. if 'videoResource' not in video_info:
  144. video_url_dict["video_url"] = ''
  145. video_url_dict["audio_url"] = ''
  146. video_url_dict["video_width"] = 0
  147. video_url_dict["video_height"] = 0
  148. elif 'dash_120fps' in video_info['videoResource']:
  149. if "video_list" in video_info['videoResource']['dash_120fps'] and len(video_info['videoResource']['dash_120fps']['video_list']) != 0:
  150. video_url = video_info['videoResource']['dash_120fps']['video_list'][-1]['backup_url_1']
  151. audio_url = video_info['videoResource']['dash_120fps']['video_list'][-1]['backup_url_1']
  152. if len(video_url) % 3 == 1:
  153. video_url += '=='
  154. elif len(video_url) % 3 == 2:
  155. video_url += '='
  156. elif len(audio_url) % 3 == 1:
  157. audio_url += '=='
  158. elif len(audio_url) % 3 == 2:
  159. audio_url += '='
  160. video_url = base64.b64decode(video_url).decode('utf8')
  161. audio_url = base64.b64decode(audio_url).decode('utf8')
  162. video_width = video_info['videoResource']['dash_120fps']['video_list'][-1]['vwidth']
  163. video_height = video_info['videoResource']['dash_120fps']['video_list'][-1]['vheight']
  164. video_url_dict["video_url"] = video_url
  165. video_url_dict["audio_url"] = audio_url
  166. video_url_dict["video_width"] = video_width
  167. video_url_dict["video_height"] = video_height
  168. elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
  169. and 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
  170. and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  171. and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  172. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
  173. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
  174. video_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  175. audio_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  176. if len(video_url) % 3 == 1:
  177. video_url += '=='
  178. elif len(video_url) % 3 == 2:
  179. video_url += '='
  180. elif len(audio_url) % 3 == 1:
  181. audio_url += '=='
  182. elif len(audio_url) % 3 == 2:
  183. audio_url += '='
  184. video_url = base64.b64decode(video_url).decode('utf8')
  185. audio_url = base64.b64decode(audio_url).decode('utf8')
  186. video_width = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  187. video_height = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  188. video_url_dict["video_url"] = video_url
  189. video_url_dict["audio_url"] = audio_url
  190. video_url_dict["video_width"] = video_width
  191. video_url_dict["video_height"] = video_height
  192. elif 'dash' in video_info['videoResource'] \
  193. and 'dynamic_video' in video_info['videoResource']['dash'] \
  194. and 'dynamic_video_list' in video_info['videoResource']['dash']['dynamic_video']:
  195. video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  196. audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  197. if len(video_url) % 3 == 1:
  198. video_url += '=='
  199. elif len(video_url) % 3 == 2:
  200. video_url += '='
  201. elif len(audio_url) % 3 == 1:
  202. audio_url += '=='
  203. elif len(audio_url) % 3 == 2:
  204. audio_url += '='
  205. video_url = base64.b64decode(video_url).decode('utf8')
  206. audio_url = base64.b64decode(audio_url).decode('utf8')
  207. video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  208. video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  209. elif 'normal' in video_info['videoResource']:
  210. video_url = video_info['videoResource']['normal']['video_list'][-1]['backup_url_1']
  211. audio_url = video_info['videoResource']['normal']['video_list'][-1]['backup_url_1']
  212. if len(video_url) % 3 == 1:
  213. video_url += '=='
  214. elif len(video_url) % 3 == 2:
  215. video_url += '='
  216. elif len(audio_url) % 3 == 1:
  217. audio_url += '=='
  218. elif len(audio_url) % 3 == 2:
  219. audio_url += '='
  220. video_url = base64.b64decode(video_url).decode('utf8')
  221. audio_url = base64.b64decode(audio_url).decode('utf8')
  222. video_width = video_info['videoResource']['normal']['video_list'][-1]['vwidth']
  223. video_height = video_info['videoResource']['normal']['video_list'][-1]['vheight']
  224. else:
  225. video_url = 0
  226. audio_url = 0
  227. video_width = 0
  228. video_height = 0
  229. return video_url_dict
  230. # except Exception as e:
  231. # Common.logger(log_type).error(f'get_video_info异常:{e}\n')
  232. @classmethod
  233. def get_videolist(cls, log_type, crawler, out_uid, machine):
  234. signature = cls.get_signature(log_type, crawler, out_uid, machine)
  235. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  236. params = {
  237. 'to_user_id': str(out_uid),
  238. 'offset': str(cls.offset),
  239. 'limit': '30',
  240. 'maxBehotTime': '0',
  241. 'order': 'new',
  242. 'isHome': '0',
  243. 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  244. 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  245. '_signature': signature,
  246. }
  247. headers = {
  248. 'authority': 'www.ixigua.com',
  249. 'accept': 'application/json, text/plain, */*',
  250. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  251. 'cache-control': 'no-cache',
  252. 'cookie': f'MONITOR_WEB_ID=7168304743566296612; __ac_signature={signature}; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; msToken=G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==; tt_scid=o4agqz7u9SKPwfBoPt6S82Cw0q.9KDtqmNe0JHxMqmpxNHQWq1BmrQdgVU6jEoX7ed99; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1676618894%7Cee5ad95378275f282f230a7ffa9947ae7eff40d0829c5a2568672a6dc90a1c96; ixigua-a-s=1',
  253. 'pragma': 'no-cache',
  254. 'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  255. 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
  256. 'sec-ch-ua-mobile': '?0',
  257. 'sec-ch-ua-platform': '"macOS"',
  258. 'sec-fetch-dest': 'empty',
  259. 'sec-fetch-mode': 'cors',
  260. 'sec-fetch-site': 'same-origin',
  261. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  262. 'x-secsdk-csrf-token': '00010000000119e3f9454d1dcbb288704cda1960f241e2d19bd21f2fd283520c3615a990ac5a17448bfbb902a249'
  263. }
  264. urllib3.disable_warnings()
  265. response = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
  266. cls.offset += 30
  267. if response.status_code != 200:
  268. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  269. elif 'data' not in response.text:
  270. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
  271. elif 'videoList' not in response.json()["data"]:
  272. Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
  273. else:
  274. videoList = response.json()['data']['videoList']
  275. for i in range(len(videoList)):
  276. # video_title
  277. if 'title' not in videoList[i]:
  278. video_title = 0
  279. else:
  280. video_title = videoList[i]['title'].strip().replace('手游', '') \
  281. .replace('/', '').replace('\/', '').replace('\n', '')
  282. # video_id
  283. if 'video_id' not in videoList[i]:
  284. video_id = 0
  285. else:
  286. video_id = videoList[i]['video_id']
  287. # gid
  288. if 'gid' not in videoList[i]:
  289. gid = 0
  290. else:
  291. gid = videoList[i]['gid']
  292. # play_cnt
  293. if 'video_detail_info' not in videoList[i]:
  294. play_cnt = 0
  295. elif 'video_watch_count' not in videoList[i]['video_detail_info']:
  296. play_cnt = 0
  297. else:
  298. play_cnt = videoList[i]['video_detail_info']['video_watch_count']
  299. # comment_cnt
  300. if 'comment_count' not in videoList[i]:
  301. comment_cnt = 0
  302. else:
  303. comment_cnt = videoList[i]['comment_count']
  304. # like_cnt
  305. if 'digg_count' not in videoList[i]:
  306. like_cnt = 0
  307. else:
  308. like_cnt = videoList[i]['digg_count']
  309. # share_cnt
  310. share_cnt = 0
  311. # video_duration
  312. if 'video_duration' not in videoList[i]:
  313. video_duration = 0
  314. else:
  315. video_duration = videoList[i]['video_duration']
  316. # send_time
  317. if 'publish_time' not in videoList[i]:
  318. publish_time = 0
  319. else:
  320. publish_time = videoList[i]['publish_time']
  321. # is_top
  322. if 'is_top' not in videoList[i]:
  323. is_top = 0
  324. else:
  325. is_top = videoList[i]['is_top']
  326. # user_name
  327. if 'user_info' not in videoList[i]:
  328. user_name = 0
  329. elif 'name' not in videoList[i]['user_info']:
  330. user_name = 0
  331. else:
  332. user_name = videoList[i]['user_info']['name']
  333. # user_id
  334. if 'user_info' not in videoList[i]:
  335. user_id = 0
  336. elif 'user_id' not in videoList[i]['user_info']:
  337. user_id = 0
  338. else:
  339. user_id = videoList[i]['user_info']['user_id']
  340. # avatar_url
  341. if 'user_info' not in videoList[i]:
  342. avatar_url = 0
  343. elif 'avatar_url' not in videoList[i]['user_info']:
  344. avatar_url = 0
  345. else:
  346. avatar_url = videoList[i]['user_info']['avatar_url']
  347. # cover_url
  348. if 'video_detail_info' not in videoList[i]:
  349. cover_url = 0
  350. elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
  351. cover_url = 0
  352. elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
  353. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
  354. else:
  355. cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
  356. Common.logger(log_type, crawler).info(
  357. f'send_time:{time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time))}')
  358. video_url_dict = cls.get_video_url(log_type, crawler, gid)
  359. video_url = video_url_dict["video_url"]
  360. audio_url = video_url_dict["audio_url"]
  361. video_width = video_url_dict["video_width"]
  362. video_height = video_url_dict["video_height"]
  363. video_dict = {'video_title': video_title,
  364. 'video_id': video_id,
  365. 'gid': gid,
  366. 'play_cnt': play_cnt,
  367. 'comment_cnt': comment_cnt,
  368. 'like_cnt': like_cnt,
  369. 'share_cnt': share_cnt,
  370. 'video_width': video_width,
  371. 'video_height': video_height,
  372. 'video_duration': video_duration,
  373. 'publish_time': publish_time,
  374. 'is_top': is_top,
  375. 'user_name': user_name,
  376. 'user_id': user_id,
  377. 'avatar_url': avatar_url,
  378. 'cover_url': cover_url,
  379. 'audio_url': audio_url,
  380. 'video_url': video_url}
  381. for k, v in video_dict.items():
  382. print(f"{k}:{v}")
  383. print("\n")
  384. if __name__ == '__main__':
  385. # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
  386. Follow.get_videolist("follow", "xigua", "95420624045", "local")
  387. pass