xigua_follow.py 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/17
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import shutil
  9. import string
  10. import sys
  11. import time
  12. import requests
  13. import urllib3
  14. from selenium.webdriver import DesiredCapabilities
  15. from selenium.webdriver.chrome.service import Service
  16. from selenium.webdriver.common.by import By
  17. from selenium import webdriver
  18. from lxml import etree
  19. sys.path.append(os.getcwd())
  20. from common.db import MysqlHelper
  21. from common.users import Users
  22. from common.common import Common
  23. from common.feishu import Feishu
  24. from common.publish import Publish
  25. proxies = {"http": None, "https": None}
class Follow:
    # Pagination offset for the personal-homepage video list.
    offset = 0
    # Platform name and crawler tag written into downstream user/video records.
    platform = "西瓜视频"
    tag = "西瓜视频爬虫,定向爬虫策略"
  31. @classmethod
  32. def get_rule(cls, log_type, crawler):
  33. try:
  34. while True:
  35. rule_sheet = Feishu.get_values_batch(log_type, crawler, "4kxd31")
  36. if rule_sheet is None:
  37. Common.logger(log_type, crawler).warning("rule_sheet is None! 10秒后重新获取")
  38. time.sleep(10)
  39. continue
  40. rule_dict = {
  41. "play_cnt": int(rule_sheet[1][2]),
  42. "comment_cnt": int(rule_sheet[2][2]),
  43. "like_cnt": int(rule_sheet[3][2]),
  44. "duration": int(rule_sheet[4][2]),
  45. "publish_time": int(rule_sheet[5][2]),
  46. "video_width": int(rule_sheet[6][2]),
  47. "video_height": int(rule_sheet[7][2]),
  48. }
  49. return rule_dict
  50. except Exception as e:
  51. Common.logger(log_type, crawler).error(f"get_rule:{e}\n")
  52. # 下载规则
  53. @classmethod
  54. def download_rule(cls, video_info_dict, rule_dict):
  55. if video_info_dict['play_cnt'] >= rule_dict['play_cnt']:
  56. if video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']:
  57. if video_info_dict['like_cnt'] >= rule_dict['like_cnt']:
  58. if video_info_dict['duration'] >= rule_dict['duration']:
  59. if video_info_dict['video_width'] >= rule_dict['video_width'] \
  60. or video_info_dict['video_height'] >= rule_dict['video_height']:
  61. return True
  62. else:
  63. return False
  64. else:
  65. return False
  66. else:
  67. return False
  68. else:
  69. return False
  70. else:
  71. return False
  72. # 过滤词库
  73. @classmethod
  74. def filter_words(cls, log_type, crawler):
  75. try:
  76. while True:
  77. filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
  78. if filter_words_sheet is None:
  79. Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
  80. continue
  81. filter_words_list = []
  82. for x in filter_words_sheet:
  83. for y in x:
  84. if y is None:
  85. pass
  86. else:
  87. filter_words_list.append(y)
  88. return filter_words_list
  89. except Exception as e:
  90. Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  91. @classmethod
  92. def get_out_user_info(cls, log_type, crawler, out_uid):
  93. try:
  94. headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  95. 'referer': f'https://www.ixigua.com/home/{out_uid}',
  96. 'Cookie': f'ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; __ac_signature={cls.random_signature()}; MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; s_v_web_id=verify_lef4i99x_32SosrdH_Qrtk_4LJn_8S7q_fhu16xe3s8ZV; tt_scid=QLJjPuHf6wxVqu6IIq6gHiJXQpVrCwrdhjH2zpm7-E3ZniE1RXBcP6M8b41FJOdo41e1; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1677047013%7C5866a444e5ae10a9df8c11551db75010fb77b657f214ccf84e503fae8d313d09; msToken=PerXJcDdIsZ6zXkGITsftXX4mDaVaW21GuqtzSVdctH46oXXT2GcELIs9f0XW2hunRzP6KVHLZaYElRvNYflLKUXih7lC27XKxs3HjdZiXPK9NQaoKbLfA==; ixigua-a-s=1',}
  97. url = f"https://www.ixigua.com/home/{out_uid}"
  98. response = requests.get(url=url, headers=headers, proxies=proxies).text
  99. html = etree.HTML(response)
  100. out_follow_str = html.xpath('//div[@class="userDetailV3__header__detail2"]/*[1]/span')[0].text.encode('raw_unicode_escape').decode()
  101. out_fans_str = html.xpath('//div[@class="userDetailV3__header__detail2"]/*[2]/span')[0].text.encode('raw_unicode_escape').decode()
  102. out_like_str = html.xpath('//div[@class="userDetailV3__header__detail2"]/*[3]/span')[0].text.encode('raw_unicode_escape').decode()
  103. out_avatar_url = f"""https:{html.xpath('//span[@class="component-avatar__inner"]//img/@src')[0]}"""
  104. if "万" in out_follow_str:
  105. out_follow = int(float(out_follow_str.split("万")[0])*10000)
  106. else:
  107. out_follow = int(out_follow_str.replace(",", ""))
  108. if "万" in out_fans_str:
  109. out_fans = int(float(out_fans_str.split("万")[0])*10000)
  110. else:
  111. out_fans = int(out_fans_str.replace(",", ""))
  112. if "万" in out_like_str:
  113. out_like = int(float(out_like_str.split("万")[0])*10000)
  114. else:
  115. out_like = int(out_like_str.replace(",", ""))
  116. out_user_dict = {
  117. "out_follow": out_follow,
  118. "out_fans": out_fans,
  119. "out_like": out_like,
  120. "out_avatar_url": out_avatar_url,
  121. }
  122. # for k, v in out_user_dict.items():
  123. # print(f"{k}:{v}")
  124. return out_user_dict
  125. except Exception as e:
  126. Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")
  127. # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
  128. @classmethod
  129. def get_user_list(cls, log_type, crawler, sheetid, env, machine):
  130. try:
  131. while True:
  132. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  133. if user_sheet is None:
  134. Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
  135. continue
  136. our_user_list = []
  137. for i in range(1, len(user_sheet)):
  138. out_uid = user_sheet[i][2]
  139. user_name = user_sheet[i][3]
  140. our_uid = user_sheet[i][6]
  141. our_user_link = user_sheet[i][7]
  142. if out_uid is None or user_name is None:
  143. Common.logger(log_type, crawler).info("空行\n")
  144. else:
  145. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  146. if our_uid is None:
  147. out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
  148. out_user_dict = {
  149. "out_uid": out_uid,
  150. "user_name": user_name,
  151. "out_avatar_url": out_user_info["out_avatar_url"],
  152. "out_create_time": '',
  153. "out_tag": '',
  154. "out_play_cnt": 0,
  155. "out_fans": out_user_info["out_fans"],
  156. "out_follow": out_user_info["out_follow"],
  157. "out_friend": 0,
  158. "out_like": out_user_info["out_like"],
  159. "platform": cls.platform,
  160. "tag": cls.tag,
  161. }
  162. our_user_dict = Users.create_user(log_type=log_type, crawler=crawler, out_user_dict=out_user_dict, env=env, machine=machine)
  163. our_uid = our_user_dict['our_uid']
  164. our_user_link = our_user_dict['our_user_link']
  165. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
  166. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  167. our_user_list.append(our_user_dict)
  168. else:
  169. our_user_dict = {
  170. 'out_uid': out_uid,
  171. 'user_name': user_name,
  172. 'our_uid': our_uid,
  173. 'our_user_link': our_user_link,
  174. }
  175. our_user_list.append(our_user_dict)
  176. return our_user_list
  177. except Exception as e:
  178. Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
  179. @classmethod
  180. def random_signature(cls):
  181. src_digits = string.digits # string_数字
  182. src_uppercase = string.ascii_uppercase # string_大写字母
  183. src_lowercase = string.ascii_lowercase # string_小写字母
  184. digits_num = random.randint(1, 6)
  185. uppercase_num = random.randint(1, 26 - digits_num - 1)
  186. lowercase_num = 26 - (digits_num + uppercase_num)
  187. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  188. src_lowercase, lowercase_num)
  189. random.shuffle(password)
  190. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  191. new_password_start = new_password[0:18]
  192. new_password_end = new_password[-7:]
  193. if new_password[18] == '8':
  194. new_password = new_password_start + 'w' + new_password_end
  195. elif new_password[18] == '9':
  196. new_password = new_password_start + 'x' + new_password_end
  197. elif new_password[18] == '-':
  198. new_password = new_password_start + 'y' + new_password_end
  199. elif new_password[18] == '.':
  200. new_password = new_password_start + 'z' + new_password_end
  201. else:
  202. new_password = new_password_start + 'y' + new_password_end
  203. return new_password
  204. @classmethod
  205. def get_signature(cls, log_type, crawler, out_uid, machine):
  206. try:
  207. # 打印请求配置
  208. ca = DesiredCapabilities.CHROME
  209. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  210. # 不打开浏览器运行
  211. chrome_options = webdriver.ChromeOptions()
  212. chrome_options.add_argument("--headless")
  213. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  214. chrome_options.add_argument("--no-sandbox")
  215. # driver初始化
  216. if machine == 'aliyun' or machine == 'aliyun_hk':
  217. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  218. elif machine == 'macpro':
  219. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  220. service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
  221. elif machine == 'macair':
  222. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
  223. service=Service('/Users/piaoquan/Downloads/chromedriver'))
  224. else:
  225. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
  226. driver.implicitly_wait(10)
  227. driver.get(f'https://www.ixigua.com/home/{out_uid}/')
  228. time.sleep(3)
  229. data_src = driver.find_elements(By.XPATH, '//img[@class="tt-img BU-MagicImage tt-img-loaded"]')[1].get_attribute("data-src")
  230. signature = data_src.split("x-signature=")[-1]
  231. return signature
  232. except Exception as e:
  233. Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
  234. # 获取视频详情
  235. @classmethod
  236. def get_video_url(cls, log_type, crawler, gid):
  237. try:
  238. url = 'https://www.ixigua.com/api/mixVideo/information?'
  239. headers = {
  240. "accept-encoding": "gzip, deflate",
  241. "accept-language": "zh-CN,zh-Hans;q=0.9",
  242. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
  243. "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
  244. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  245. }
  246. params = {
  247. 'mixId': gid,
  248. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
  249. 'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  250. 'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
  251. '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
  252. 'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
  253. }
  254. cookies = {
  255. 'ixigua-a-s': '1',
  256. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
  257. 'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  258. 'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
  259. '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
  260. 'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
  261. 'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
  262. '__ac_nonce': '06304878000964fdad287',
  263. '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
  264. 'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
  265. 'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
  266. '_tea_utm_cache_1300': 'undefined',
  267. 'support_avif': 'false',
  268. 'support_webp': 'false',
  269. 'xiguavideopcwebid': '7134967546256016900',
  270. 'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
  271. }
  272. urllib3.disable_warnings()
  273. response = requests.get(url=url, headers=headers, params=params, cookies=cookies, verify=False)
  274. if 'data' not in response.json() or response.json()['data'] == '':
  275. Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
  276. else:
  277. video_info = response.json()['data']['gidInformation']['packerData']['video']
  278. video_url_dict = {}
  279. # video_url
  280. if 'videoResource' not in video_info:
  281. video_url_dict["video_url"] = ''
  282. video_url_dict["audio_url"] = ''
  283. video_url_dict["video_width"] = 0
  284. video_url_dict["video_height"] = 0
  285. elif 'dash_120fps' in video_info['videoResource']:
  286. if "video_list" in video_info['videoResource']['dash_120fps'] and 'video_4' in video_info['videoResource']['dash_120fps']['video_list']:
  287. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
  288. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
  289. if len(video_url) % 3 == 1:
  290. video_url += '=='
  291. elif len(video_url) % 3 == 2:
  292. video_url += '='
  293. elif len(audio_url) % 3 == 1:
  294. audio_url += '=='
  295. elif len(audio_url) % 3 == 2:
  296. audio_url += '='
  297. video_url = base64.b64decode(video_url).decode('utf8')
  298. audio_url = base64.b64decode(audio_url).decode('utf8')
  299. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_4']['vwidth']
  300. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_4']['vheight']
  301. video_url_dict["video_url"] = video_url
  302. video_url_dict["audio_url"] = audio_url
  303. video_url_dict["video_width"] = video_width
  304. video_url_dict["video_height"] = video_height
  305. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_3' in video_info['videoResource']['dash_120fps']['video_list']:
  306. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
  307. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
  308. if len(video_url) % 3 == 1:
  309. video_url += '=='
  310. elif len(video_url) % 3 == 2:
  311. video_url += '='
  312. elif len(audio_url) % 3 == 1:
  313. audio_url += '=='
  314. elif len(audio_url) % 3 == 2:
  315. audio_url += '='
  316. video_url = base64.b64decode(video_url).decode('utf8')
  317. audio_url = base64.b64decode(audio_url).decode('utf8')
  318. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_3']['vwidth']
  319. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_3']['vheight']
  320. video_url_dict["video_url"] = video_url
  321. video_url_dict["audio_url"] = audio_url
  322. video_url_dict["video_width"] = video_width
  323. video_url_dict["video_height"] = video_height
  324. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_2' in video_info['videoResource']['dash_120fps']['video_list']:
  325. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
  326. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
  327. if len(video_url) % 3 == 1:
  328. video_url += '=='
  329. elif len(video_url) % 3 == 2:
  330. video_url += '='
  331. elif len(audio_url) % 3 == 1:
  332. audio_url += '=='
  333. elif len(audio_url) % 3 == 2:
  334. audio_url += '='
  335. video_url = base64.b64decode(video_url).decode('utf8')
  336. audio_url = base64.b64decode(audio_url).decode('utf8')
  337. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_2']['vwidth']
  338. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_2']['vheight']
  339. video_url_dict["video_url"] = video_url
  340. video_url_dict["audio_url"] = audio_url
  341. video_url_dict["video_width"] = video_width
  342. video_url_dict["video_height"] = video_height
  343. elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_1' in video_info['videoResource']['dash_120fps']['video_list']:
  344. video_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
  345. audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
  346. if len(video_url) % 3 == 1:
  347. video_url += '=='
  348. elif len(video_url) % 3 == 2:
  349. video_url += '='
  350. elif len(audio_url) % 3 == 1:
  351. audio_url += '=='
  352. elif len(audio_url) % 3 == 2:
  353. audio_url += '='
  354. video_url = base64.b64decode(video_url).decode('utf8')
  355. audio_url = base64.b64decode(audio_url).decode('utf8')
  356. video_width = video_info['videoResource']['dash_120fps']['video_list']['video_1']['vwidth']
  357. video_height = video_info['videoResource']['dash_120fps']['video_list']['video_1']['vheight']
  358. video_url_dict["video_url"] = video_url
  359. video_url_dict["audio_url"] = audio_url
  360. video_url_dict["video_width"] = video_width
  361. video_url_dict["video_height"] = video_height
  362. elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
  363. and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  364. and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
  365. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
  366. and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
  367. video_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  368. audio_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  369. if len(video_url) % 3 == 1:
  370. video_url += '=='
  371. elif len(video_url) % 3 == 2:
  372. video_url += '='
  373. elif len(audio_url) % 3 == 1:
  374. audio_url += '=='
  375. elif len(audio_url) % 3 == 2:
  376. audio_url += '='
  377. video_url = base64.b64decode(video_url).decode('utf8')
  378. audio_url = base64.b64decode(audio_url).decode('utf8')
  379. video_width = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  380. video_height = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  381. video_url_dict["video_url"] = video_url
  382. video_url_dict["audio_url"] = audio_url
  383. video_url_dict["video_width"] = video_width
  384. video_url_dict["video_height"] = video_height
  385. else:
  386. video_url_dict["video_url"] = ''
  387. video_url_dict["audio_url"] = ''
  388. video_url_dict["video_width"] = 0
  389. video_url_dict["video_height"] = 0
  390. elif 'dash' in video_info['videoResource']:
  391. if "video_list" in video_info['videoResource']['dash'] and 'video_4' in video_info['videoResource']['dash']['video_list']:
  392. video_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  393. audio_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
  394. if len(video_url) % 3 == 1:
  395. video_url += '=='
  396. elif len(video_url) % 3 == 2:
  397. video_url += '='
  398. elif len(audio_url) % 3 == 1:
  399. audio_url += '=='
  400. elif len(audio_url) % 3 == 2:
  401. audio_url += '='
  402. video_url = base64.b64decode(video_url).decode('utf8')
  403. audio_url = base64.b64decode(audio_url).decode('utf8')
  404. video_width = video_info['videoResource']['dash']['video_list']['video_4']['vwidth']
  405. video_height = video_info['videoResource']['dash']['video_list']['video_4']['vheight']
  406. video_url_dict["video_url"] = video_url
  407. video_url_dict["audio_url"] = audio_url
  408. video_url_dict["video_width"] = video_width
  409. video_url_dict["video_height"] = video_height
  410. elif "video_list" in video_info['videoResource']['dash'] and 'video_3' in video_info['videoResource']['dash']['video_list']:
  411. video_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  412. audio_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
  413. if len(video_url) % 3 == 1:
  414. video_url += '=='
  415. elif len(video_url) % 3 == 2:
  416. video_url += '='
  417. elif len(audio_url) % 3 == 1:
  418. audio_url += '=='
  419. elif len(audio_url) % 3 == 2:
  420. audio_url += '='
  421. video_url = base64.b64decode(video_url).decode('utf8')
  422. audio_url = base64.b64decode(audio_url).decode('utf8')
  423. video_width = video_info['videoResource']['dash']['video_list']['video_3']['vwidth']
  424. video_height = video_info['videoResource']['dash']['video_list']['video_3']['vheight']
  425. video_url_dict["video_url"] = video_url
  426. video_url_dict["audio_url"] = audio_url
  427. video_url_dict["video_width"] = video_width
  428. video_url_dict["video_height"] = video_height
  429. elif "video_list" in video_info['videoResource']['dash'] and 'video_2' in video_info['videoResource']['dash']['video_list']:
  430. video_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  431. audio_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
  432. if len(video_url) % 3 == 1:
  433. video_url += '=='
  434. elif len(video_url) % 3 == 2:
  435. video_url += '='
  436. elif len(audio_url) % 3 == 1:
  437. audio_url += '=='
  438. elif len(audio_url) % 3 == 2:
  439. audio_url += '='
  440. video_url = base64.b64decode(video_url).decode('utf8')
  441. audio_url = base64.b64decode(audio_url).decode('utf8')
  442. video_width = video_info['videoResource']['dash']['video_list']['video_2']['vwidth']
  443. video_height = video_info['videoResource']['dash']['video_list']['video_2']['vheight']
  444. video_url_dict["video_url"] = video_url
  445. video_url_dict["audio_url"] = audio_url
  446. video_url_dict["video_width"] = video_width
  447. video_url_dict["video_height"] = video_height
  448. elif "video_list" in video_info['videoResource']['dash'] and 'video_1' in video_info['videoResource']['dash']['video_list']:
  449. video_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  450. audio_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
  451. if len(video_url) % 3 == 1:
  452. video_url += '=='
  453. elif len(video_url) % 3 == 2:
  454. video_url += '='
  455. elif len(audio_url) % 3 == 1:
  456. audio_url += '=='
  457. elif len(audio_url) % 3 == 2:
  458. audio_url += '='
  459. video_url = base64.b64decode(video_url).decode('utf8')
  460. audio_url = base64.b64decode(audio_url).decode('utf8')
  461. video_width = video_info['videoResource']['dash']['video_list']['video_1']['vwidth']
  462. video_height = video_info['videoResource']['dash']['video_list']['video_1']['vheight']
  463. video_url_dict["video_url"] = video_url
  464. video_url_dict["audio_url"] = audio_url
  465. video_url_dict["video_width"] = video_width
  466. video_url_dict["video_height"] = video_height
  467. elif 'dynamic_video' in video_info['videoResource']['dash'] \
  468. and 'dynamic_video_list' in video_info['videoResource']['dash']['dynamic_video'] \
  469. and 'dynamic_audio_list' in video_info['videoResource']['dash']['dynamic_video'] \
  470. and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list']) != 0 \
  471. and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list']) != 0:
  472. video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
  473. audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
  474. if len(video_url) % 3 == 1:
  475. video_url += '=='
  476. elif len(video_url) % 3 == 2:
  477. video_url += '='
  478. elif len(audio_url) % 3 == 1:
  479. audio_url += '=='
  480. elif len(audio_url) % 3 == 2:
  481. audio_url += '='
  482. video_url = base64.b64decode(video_url).decode('utf8')
  483. audio_url = base64.b64decode(audio_url).decode('utf8')
  484. video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
  485. video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
  486. video_url_dict["video_url"] = video_url
  487. video_url_dict["audio_url"] = audio_url
  488. video_url_dict["video_width"] = video_width
  489. video_url_dict["video_height"] = video_height
  490. else:
  491. video_url_dict["video_url"] = ''
  492. video_url_dict["audio_url"] = ''
  493. video_url_dict["video_width"] = 0
  494. video_url_dict["video_height"] = 0
  495. elif 'normal' in video_info['videoResource']:
  496. if "video_list" in video_info['videoResource']['normal'] and 'video_4' in \
  497. video_info['videoResource']['normal']['video_list']:
  498. video_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  499. audio_url = video_info['videoResource']['normal']['video_list']['video_4']['backup_url_1']
  500. if len(video_url) % 3 == 1:
  501. video_url += '=='
  502. elif len(video_url) % 3 == 2:
  503. video_url += '='
  504. elif len(audio_url) % 3 == 1:
  505. audio_url += '=='
  506. elif len(audio_url) % 3 == 2:
  507. audio_url += '='
  508. video_url = base64.b64decode(video_url).decode('utf8')
  509. audio_url = base64.b64decode(audio_url).decode('utf8')
  510. video_width = video_info['videoResource']['normal']['video_list']['video_4']['vwidth']
  511. video_height = video_info['videoResource']['normal']['video_list']['video_4']['vheight']
  512. video_url_dict["video_url"] = video_url
  513. video_url_dict["audio_url"] = audio_url
  514. video_url_dict["video_width"] = video_width
  515. video_url_dict["video_height"] = video_height
  516. elif "video_list" in video_info['videoResource']['normal'] and 'video_3' in \
  517. video_info['videoResource']['normal']['video_list']:
  518. video_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  519. audio_url = video_info['videoResource']['normal']['video_list']['video_3']['backup_url_1']
  520. if len(video_url) % 3 == 1:
  521. video_url += '=='
  522. elif len(video_url) % 3 == 2:
  523. video_url += '='
  524. elif len(audio_url) % 3 == 1:
  525. audio_url += '=='
  526. elif len(audio_url) % 3 == 2:
  527. audio_url += '='
  528. video_url = base64.b64decode(video_url).decode('utf8')
  529. audio_url = base64.b64decode(audio_url).decode('utf8')
  530. video_width = video_info['videoResource']['normal']['video_list']['video_3']['vwidth']
  531. video_height = video_info['videoResource']['normal']['video_list']['video_3']['vheight']
  532. video_url_dict["video_url"] = video_url
  533. video_url_dict["audio_url"] = audio_url
  534. video_url_dict["video_width"] = video_width
  535. video_url_dict["video_height"] = video_height
  536. elif "video_list" in video_info['videoResource']['normal'] and 'video_2' in \
  537. video_info['videoResource']['normal']['video_list']:
  538. video_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  539. audio_url = video_info['videoResource']['normal']['video_list']['video_2']['backup_url_1']
  540. if len(video_url) % 3 == 1:
  541. video_url += '=='
  542. elif len(video_url) % 3 == 2:
  543. video_url += '='
  544. elif len(audio_url) % 3 == 1:
  545. audio_url += '=='
  546. elif len(audio_url) % 3 == 2:
  547. audio_url += '='
  548. video_url = base64.b64decode(video_url).decode('utf8')
  549. audio_url = base64.b64decode(audio_url).decode('utf8')
  550. video_width = video_info['videoResource']['normal']['video_list']['video_2']['vwidth']
  551. video_height = video_info['videoResource']['normal']['video_list']['video_2']['vheight']
  552. video_url_dict["video_url"] = video_url
  553. video_url_dict["audio_url"] = audio_url
  554. video_url_dict["video_width"] = video_width
  555. video_url_dict["video_height"] = video_height
  556. elif "video_list" in video_info['videoResource']['normal'] and 'video_1' in \
  557. video_info['videoResource']['normal']['video_list']:
  558. video_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  559. audio_url = video_info['videoResource']['normal']['video_list']['video_1']['backup_url_1']
  560. if len(video_url) % 3 == 1:
  561. video_url += '=='
  562. elif len(video_url) % 3 == 2:
  563. video_url += '='
  564. elif len(audio_url) % 3 == 1:
  565. audio_url += '=='
  566. elif len(audio_url) % 3 == 2:
  567. audio_url += '='
  568. video_url = base64.b64decode(video_url).decode('utf8')
  569. audio_url = base64.b64decode(audio_url).decode('utf8')
  570. video_width = video_info['videoResource']['normal']['video_list']['video_1']['vwidth']
  571. video_height = video_info['videoResource']['normal']['video_list']['video_1']['vheight']
  572. video_url_dict["video_url"] = video_url
  573. video_url_dict["audio_url"] = audio_url
  574. video_url_dict["video_width"] = video_width
  575. video_url_dict["video_height"] = video_height
  576. elif 'dynamic_video' in video_info['videoResource']['normal'] \
  577. and 'dynamic_video_list' in video_info['videoResource']['normal']['dynamic_video'] \
  578. and 'dynamic_audio_list' in video_info['videoResource']['normal']['dynamic_video'] \
  579. and len(video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list']) != 0 \
  580. and len(video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list']) != 0:
  581. video_url = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  582. 'backup_url_1']
  583. audio_url = video_info['videoResource']['normal']['dynamic_video']['dynamic_audio_list'][-1][
  584. 'backup_url_1']
  585. if len(video_url) % 3 == 1:
  586. video_url += '=='
  587. elif len(video_url) % 3 == 2:
  588. video_url += '='
  589. elif len(audio_url) % 3 == 1:
  590. audio_url += '=='
  591. elif len(audio_url) % 3 == 2:
  592. audio_url += '='
  593. video_url = base64.b64decode(video_url).decode('utf8')
  594. audio_url = base64.b64decode(audio_url).decode('utf8')
  595. video_width = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  596. 'vwidth']
  597. video_height = video_info['videoResource']['normal']['dynamic_video']['dynamic_video_list'][-1][
  598. 'vheight']
  599. video_url_dict["video_url"] = video_url
  600. video_url_dict["audio_url"] = audio_url
  601. video_url_dict["video_width"] = video_width
  602. video_url_dict["video_height"] = video_height
  603. else:
  604. video_url_dict["video_url"] = ''
  605. video_url_dict["audio_url"] = ''
  606. video_url_dict["video_width"] = 0
  607. video_url_dict["video_height"] = 0
  608. else:
  609. video_url_dict["video_url"] = ''
  610. video_url_dict["audio_url"] = ''
  611. video_url_dict["video_width"] = 0
  612. video_url_dict["video_height"] = 0
  613. return video_url_dict
  614. except Exception as e:
  615. Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
@classmethod
def get_videolist(cls, log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env, machine):
    """Page through out_uid's Xigua video list and feed each eligible video to download_publish.

    Pagination uses the class-level cursor ``cls.offset`` (30 videos per page).
    The loop stops — resetting the cursor to 0 — on a non-200 response, a
    payload missing 'data'/'videoList', or the first non-pinned video older
    than ``rule_dict['publish_time']`` days (list is assumed newest-first).
    Any exception is logged and swallowed.
    """
    try:
        signature = cls.random_signature()
        while True:
            url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
            params = {
                'to_user_id': str(out_uid),
                'offset': str(cls.offset),
                'limit': '30',
                'maxBehotTime': '0',
                'order': 'new',
                'isHome': '0',
                # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
                # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
                '_signature': signature,
            }
            headers = {
                # 'authority': 'www.ixigua.com',
                # 'accept': 'application/json, text/plain, */*',
                # 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                # 'cache-control': 'no-cache',
                # 'cookie': f'MONITOR_WEB_ID=7168304743566296612; __ac_signature={signature}; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; msToken=G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==; tt_scid=o4agqz7u9SKPwfBoPt6S82Cw0q.9KDtqmNe0JHxMqmpxNHQWq1BmrQdgVU6jEoX7ed99; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1676618894%7Cee5ad95378275f282f230a7ffa9947ae7eff40d0829c5a2568672a6dc90a1c96; ixigua-a-s=1',
                # 'pragma': 'no-cache',
                'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
                # 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
                # 'sec-ch-ua-mobile': '?0',
                # 'sec-ch-ua-platform': '"macOS"',
                # 'sec-fetch-dest': 'empty',
                # 'sec-fetch-mode': 'cors',
                # 'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
                # 'x-secsdk-csrf-token': '00010000000119e3f9454d1dcbb288704cda1960f241e2d19bd21f2fd283520c3615a990ac5a17448bfbb902a249'
            }
            urllib3.disable_warnings()
            # verify=False + proxies: scraped through a proxy pool, TLS warnings suppressed above
            response = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
            cls.offset += 30
            if response.status_code != 200:
                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
                cls.offset = 0
                return
            elif 'data' not in response.text:
                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
                cls.offset = 0
                return
            elif 'videoList' not in response.json()["data"]:
                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
                cls.offset = 0
                return
            else:
                videoList = response.json()['data']['videoList']
                for i in range(len(videoList)):
                    # video_title (0 doubles as the "missing" sentinel used by the validity check below)
                    if 'title' not in videoList[i]:
                        video_title = 0
                    else:
                        # NOTE(review): '\/' is the same string as '/' in Python, so the
                        # second replace is redundant (and triggers an invalid-escape warning).
                        video_title = videoList[i]['title'].strip().replace('手游', '') \
                            .replace('/', '').replace('\/', '').replace('\n', '')
                    # video_id
                    if 'video_id' not in videoList[i]:
                        video_id = 0
                    else:
                        video_id = videoList[i]['video_id']
                    # gid
                    if 'gid' not in videoList[i]:
                        gid = 0
                    else:
                        gid = videoList[i]['gid']
                    # play_cnt
                    if 'video_detail_info' not in videoList[i]:
                        play_cnt = 0
                    elif 'video_watch_count' not in videoList[i]['video_detail_info']:
                        play_cnt = 0
                    else:
                        play_cnt = videoList[i]['video_detail_info']['video_watch_count']
                    # comment_cnt
                    if 'comment_count' not in videoList[i]:
                        comment_cnt = 0
                    else:
                        comment_cnt = videoList[i]['comment_count']
                    # like_cnt
                    if 'digg_count' not in videoList[i]:
                        like_cnt = 0
                    else:
                        like_cnt = videoList[i]['digg_count']
                    # share_cnt — this endpoint does not expose a share count
                    share_cnt = 0
                    # video_duration
                    if 'video_duration' not in videoList[i]:
                        video_duration = 0
                    else:
                        video_duration = int(videoList[i]['video_duration'])
                    # send_time
                    if 'publish_time' not in videoList[i]:
                        publish_time = 0
                    else:
                        publish_time = videoList[i]['publish_time']
                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
                    # is_top (pinned flag; pinned videos may be older than the window)
                    if 'is_top' not in videoList[i]:
                        is_top = 0
                    else:
                        is_top = videoList[i]['is_top']
                    # user_name
                    if 'user_info' not in videoList[i]:
                        user_name = 0
                    elif 'name' not in videoList[i]['user_info']:
                        user_name = 0
                    else:
                        user_name = videoList[i]['user_info']['name']
                    # user_id
                    if 'user_info' not in videoList[i]:
                        user_id = 0
                    elif 'user_id' not in videoList[i]['user_info']:
                        user_id = 0
                    else:
                        user_id = videoList[i]['user_info']['user_id']
                    # avatar_url
                    if 'user_info' not in videoList[i]:
                        avatar_url = 0
                    elif 'avatar_url' not in videoList[i]['user_info']:
                        avatar_url = 0
                    else:
                        avatar_url = videoList[i]['user_info']['avatar_url']
                    # cover_url: prefer the direct 'url', fall back to url_list[0]
                    if 'video_detail_info' not in videoList[i]:
                        cover_url = 0
                    elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
                        cover_url = 0
                    elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
                        cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
                    else:
                        cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
                    # Retry every 10s until the crawl-rule sheet is readable
                    while True:
                        rule_dict = cls.get_rule(log_type, crawler)
                        if rule_dict is None:
                            Common.logger(log_type, crawler).warning(f"rule_dict:{rule_dict}, 10秒后重试")
                            time.sleep(10)
                        else:
                            break
                    if gid == 0 or video_id == 0 or cover_url == 0:
                        Common.logger(log_type, crawler).info('无效视频\n')
                    elif is_top is True and int(time.time()) - int(publish_time) > 3600 * 24 * rule_dict['publish_time']:
                        # Over-age pinned video: skip it but keep paginating
                        Common.logger(log_type, crawler).info(f'置顶视频,且发布时间:{publish_time_str} 超过{rule_dict["publish_time"]}天\n')
                    elif int(time.time()) - int(publish_time) > 3600 * 24 * rule_dict['publish_time']:
                        # Newest-first listing: the first over-age normal video ends the whole crawl
                        Common.logger(log_type, crawler).info(f'发布时间:{publish_time_str}超过{rule_dict["publish_time"]}天\n')
                        cls.offset = 0
                        return
                    else:
                        video_url_dict = cls.get_video_url(log_type, crawler, gid)
                        video_url = video_url_dict["video_url"]
                        audio_url = video_url_dict["audio_url"]
                        video_width = video_url_dict["video_width"]
                        video_height = video_url_dict["video_height"]
                        video_dict = {'video_title': video_title,
                                      'video_id': video_id,
                                      'gid': gid,
                                      'play_cnt': play_cnt,
                                      'comment_cnt': comment_cnt,
                                      'like_cnt': like_cnt,
                                      'share_cnt': share_cnt,
                                      'video_width': video_width,
                                      'video_height': video_height,
                                      'duration': video_duration,
                                      'publish_time_stamp': publish_time,
                                      'publish_time_str': publish_time_str,
                                      'is_top': is_top,
                                      'user_name': user_name,
                                      'user_id': user_id,
                                      'avatar_url': avatar_url,
                                      'cover_url': cover_url,
                                      'audio_url': audio_url,
                                      'video_url': video_url,
                                      'session': signature}
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        cls.download_publish(log_type=log_type,
                                             crawler=crawler,
                                             video_dict=video_dict,
                                             rule_dict=rule_dict,
                                             strategy=strategy,
                                             our_uid=our_uid,
                                             oss_endpoint=oss_endpoint,
                                             env=env,
                                             machine=machine)
    except Exception as e:
        Common.logger(log_type, crawler).error(f"get_videolist:{e}\n")
  803. @classmethod
  804. def repeat_video(cls, log_type, crawler, video_id, env, machine):
  805. sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
  806. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  807. return len(repeat_video)
# Download / upload
@classmethod
def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
    """Download one video's cover/video/audio, merge A/V, publish it to Piaoquan,
    then record the result in the Feishu sheet and the crawler_video MySQL table.

    The video is skipped (logged, no error) when it fails the crawl rule, its
    title contains a filter word, or it is already in the database. Any
    exception is logged and swallowed.
    """
    try:
        if cls.download_rule(video_dict, rule_dict) is False:
            Common.logger(log_type, crawler).info('不满足抓取规则\n')
        elif any(word if word in video_dict['video_title'] else False for word in cls.filter_words(log_type, crawler)) is True:
            # NOTE(review): '{}' with a positional arg only formats if Common.logger
            # returns a loguru-style logger — confirm, otherwise the title is dropped.
            Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
            Common.logger(log_type, crawler).info('视频已下载\n')
        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'e075e9') for x in y]:
        #     Common.logger(log_type, crawler).info('视频已下载\n')
        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', '3Ul6wZ') for x in y]:
        #     Common.logger(log_type, crawler).info('视频已下载\n')
        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'QOWqMo') for x in y]:
        #     Common.logger(log_type, crawler).info('视频已下载\n')
        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'wjhpDs') for x in y]:
        #     Common.logger(log_type, crawler).info('视频已存在\n')
        else:
            # Download the cover image
            Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
            # Download the video stream
            Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=video_dict['video_title'], url=video_dict['video_url'])
            # Download the audio stream (Xigua serves video and audio separately)
            Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=video_dict['video_title'], url=video_dict['audio_url'])
            # Save video metadata to a txt file
            Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
            # Merge the separate video and audio tracks
            Common.video_compose(log_type=log_type, crawler=crawler, video_dir=f"./{crawler}/videos/{video_dict['video_title']}")
            # Upload the merged video
            Common.logger(log_type, crawler).info("开始上传视频...")
            our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                      crawler=crawler,
                                                      strategy=strategy,
                                                      our_uid=our_uid,
                                                      env=env,
                                                      oss_endpoint=oss_endpoint)
            if env == 'dev':
                our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            else:
                our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            Common.logger(log_type, crawler).info("视频上传完成")
            if our_video_id is None:
                # Upload failed: remove the local video folder and bail out
                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
                return
            # Record the video in the Feishu sheet
            Feishu.insert_columns(log_type, 'xigua', "e075e9", "ROWS", 1, 2)
            upload_time = int(time.time())
            values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                       "定向榜",
                       video_dict['video_title'],
                       str(video_dict['video_id']),
                       our_video_link,
                       video_dict['gid'],
                       video_dict['play_cnt'],
                       video_dict['comment_cnt'],
                       video_dict['like_cnt'],
                       video_dict['share_cnt'],
                       video_dict['duration'],
                       str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                       video_dict['publish_time_str'],
                       video_dict['user_name'],
                       video_dict['user_id'],
                       video_dict['avatar_url'],
                       video_dict['cover_url'],
                       video_dict['video_url'],
                       video_dict['audio_url']]]
            time.sleep(1)
            Feishu.update_values(log_type, 'xigua', "e075e9", "F2:Z2", values)
            Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
            # Persist the video record to MySQL
            # NOTE(review): values are interpolated straight into the SQL string —
            # a title or URL containing a double quote would break the statement.
            insert_sql = f""" insert into crawler_video(video_id,
                            user_id,
                            out_user_id,
                            platform,
                            strategy,
                            out_video_id,
                            video_title,
                            cover_url,
                            video_url,
                            duration,
                            publish_time,
                            play_cnt,
                            crawler_rule,
                            width,
                            height)
                            values({our_video_id},
                            {our_uid},
                            "{video_dict['user_id']}",
                            "{cls.platform}",
                            "定向爬虫策略",
                            "{video_dict['video_id']}",
                            "{video_dict['video_title']}",
                            "{video_dict['cover_url']}",
                            "{video_dict['video_url']}",
                            {int(video_dict['duration'])},
                            "{video_dict['publish_time_str']}",
                            {int(video_dict['play_cnt'])},
                            '{json.dumps(rule_dict)}',
                            {int(video_dict['video_width'])},
                            {int(video_dict['video_height'])}) """
            Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
            MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
            Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
    except Exception as e:
        Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
  915. @classmethod
  916. def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
  917. try:
  918. user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="5tlTYB", env=env, machine=machine)
  919. for user in user_list:
  920. out_uid = user["out_uid"]
  921. user_name = user["user_name"]
  922. our_uid = user["our_uid"]
  923. Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
  924. cls.get_videolist(log_type=log_type,
  925. crawler=crawler,
  926. strategy=strategy,
  927. our_uid=our_uid,
  928. out_uid=out_uid,
  929. oss_endpoint=oss_endpoint,
  930. env=env,
  931. machine=machine)
  932. cls.offset = 0
  933. time.sleep(3)
  934. except Exception as e:
  935. Common.logger(log_type, crawler).error(f"get_follow_videos:{e}\n")
  936. if __name__ == '__main__':
  937. # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
  938. # Follow.get_videolist(log_type="follow",
  939. # crawler="xigua",
  940. # strategy="定向爬虫策略",
  941. # our_uid="6267141",
  942. # out_uid="95420624045",
  943. # oss_endpoint="out",
  944. # env="dev",
  945. # machine="local")
  946. # print(Follow.random_signature())
  947. rule = Follow.get_rule("follow", "xigua")
  948. print(type(rule))
  949. print(type(json.dumps(rule)))
  950. print(json.dumps(rule))
  951. pass