# xigua_author.py
  1. import json
  2. import os
  3. import re
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from lxml import etree
  12. from Crypto.Cipher import AES
  13. from Crypto.Util.Padding import unpad
  14. from fake_useragent import FakeUserAgent
  15. from common.mq import MQ
  16. sys.path.append(os.getcwd())
  17. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  18. from common.limit import AuthorLimit
  19. def aes_decrypt(data: str, key: str) -> str:
  20. """
  21. XiGua AES decrypt
  22. :param data:
  23. :param key:
  24. :return:
  25. """
  26. password = key.encode()
  27. iv = password[:16]
  28. try:
  29. ct = base64.b64decode(data.encode())
  30. cipher = AES.new(password, AES.MODE_CBC, iv)
  31. pt = unpad(cipher.decrypt(ct), AES.block_size)
  32. return base64.b64decode(pt).decode()
  33. except Exception as e:
  34. print("Incorrect decryption {}".format(e))
  35. return None
  36. def extract_video_url(text):
  37. """
  38. 获取视频 video_url
  39. :param text:
  40. :return:
  41. """
  42. HTML = etree.HTML(text)
  43. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  44. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  45. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  46. # python中不规则的定义
  47. for I in Irregulars:
  48. if I in ['=false', '=true']:
  49. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  50. else:
  51. json_2 = json_2.replace(I, '12')
  52. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]["dash"]
  53. ptk = dict_2["ptk"]
  54. video_url = dict_2['dynamic_video']['main_url']
  55. real_video_url = aes_decrypt(data=video_url, key=ptk)
  56. return real_video_url
  57. def extract_info_by_re(text):
  58. """
  59. 通过正则表达式获取文本中的信息
  60. :param text:
  61. :return:
  62. """
  63. # 标题
  64. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  65. if title_match:
  66. title_content = title_match.group(1)
  67. title_content = title_content.split(" - ")[0]
  68. title_content = bytes(title_content, "latin1").decode()
  69. else:
  70. title_content = ""
  71. # video_id
  72. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  73. # like_count
  74. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  75. # cover_url
  76. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  77. # video_play
  78. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  79. # "video_publish_time"
  80. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  81. # video_duration
  82. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  83. return {
  84. "title": title_content,
  85. "url": extract_video_url(text),
  86. "video_id": video_id,
  87. "like_count": like_count,
  88. "cover_url": cover_url,
  89. "play_count": video_watch_count,
  90. "publish_time": publish_time,
  91. "duration": duration
  92. }
  93. def random_signature():
  94. """
  95. 随机生成签名
  96. """
  97. src_digits = string.digits # string_数字
  98. src_uppercase = string.ascii_uppercase # string_大写字母
  99. src_lowercase = string.ascii_lowercase # string_小写字母
  100. digits_num = random.randint(1, 6)
  101. uppercase_num = random.randint(1, 26 - digits_num - 1)
  102. lowercase_num = 26 - (digits_num + uppercase_num)
  103. password = (
  104. random.sample(src_digits, digits_num)
  105. + random.sample(src_uppercase, uppercase_num)
  106. + random.sample(src_lowercase, lowercase_num)
  107. )
  108. random.shuffle(password)
  109. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  110. new_password_start = new_password[0:18]
  111. new_password_end = new_password[-7:]
  112. if new_password[18] == "8":
  113. new_password = new_password_start + "w" + new_password_end
  114. elif new_password[18] == "9":
  115. new_password = new_password_start + "x" + new_password_end
  116. elif new_password[18] == "-":
  117. new_password = new_password_start + "y" + new_password_end
  118. elif new_password[18] == ".":
  119. new_password = new_password_start + "z" + new_password_end
  120. else:
  121. new_password = new_password_start + "y" + new_password_end
  122. return new_password
  123. def byte_dance_cookie(item_id):
  124. """
  125. 获取西瓜视频的 cookie
  126. :param item_id:
  127. """
  128. sess = requests.Session()
  129. sess.headers.update({
  130. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  131. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  132. })
  133. # 获取 cookies
  134. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  135. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  136. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  137. # print(r.text)
  138. return r.cookies.values()[0]
  139. def get_video_url(video_info):
  140. """
  141. 获取视频的链接
  142. """
  143. video_url_dict = {}
  144. # video_url
  145. if "videoResource" not in video_info:
  146. video_url_dict["video_url"] = ""
  147. video_url_dict["audio_url"] = ""
  148. video_url_dict["video_width"] = 0
  149. video_url_dict["video_height"] = 0
  150. elif "dash_120fps" in video_info["videoResource"]:
  151. if (
  152. "video_list" in video_info["videoResource"]["dash_120fps"]
  153. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  154. ):
  155. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  156. "video_4"
  157. ]["backup_url_1"]
  158. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  159. "video_4"
  160. ]["backup_url_1"]
  161. if len(video_url) % 3 == 1:
  162. video_url += "=="
  163. elif len(video_url) % 3 == 2:
  164. video_url += "="
  165. elif len(audio_url) % 3 == 1:
  166. audio_url += "=="
  167. elif len(audio_url) % 3 == 2:
  168. audio_url += "="
  169. video_url = base64.b64decode(video_url).decode("utf8")
  170. audio_url = base64.b64decode(audio_url).decode("utf8")
  171. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  172. "video_4"
  173. ]["vwidth"]
  174. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  175. "video_4"
  176. ]["vheight"]
  177. video_url_dict["video_url"] = video_url
  178. video_url_dict["audio_url"] = audio_url
  179. video_url_dict["video_width"] = video_width
  180. video_url_dict["video_height"] = video_height
  181. elif (
  182. "video_list" in video_info["videoResource"]["dash_120fps"]
  183. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  184. ):
  185. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  186. "video_3"
  187. ]["backup_url_1"]
  188. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  189. "video_3"
  190. ]["backup_url_1"]
  191. if len(video_url) % 3 == 1:
  192. video_url += "=="
  193. elif len(video_url) % 3 == 2:
  194. video_url += "="
  195. elif len(audio_url) % 3 == 1:
  196. audio_url += "=="
  197. elif len(audio_url) % 3 == 2:
  198. audio_url += "="
  199. video_url = base64.b64decode(video_url).decode("utf8")
  200. audio_url = base64.b64decode(audio_url).decode("utf8")
  201. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  202. "video_3"
  203. ]["vwidth"]
  204. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  205. "video_3"
  206. ]["vheight"]
  207. video_url_dict["video_url"] = video_url
  208. video_url_dict["audio_url"] = audio_url
  209. video_url_dict["video_width"] = video_width
  210. video_url_dict["video_height"] = video_height
  211. elif (
  212. "video_list" in video_info["videoResource"]["dash_120fps"]
  213. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  214. ):
  215. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  216. "video_2"
  217. ]["backup_url_1"]
  218. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  219. "video_2"
  220. ]["backup_url_1"]
  221. if len(video_url) % 3 == 1:
  222. video_url += "=="
  223. elif len(video_url) % 3 == 2:
  224. video_url += "="
  225. elif len(audio_url) % 3 == 1:
  226. audio_url += "=="
  227. elif len(audio_url) % 3 == 2:
  228. audio_url += "="
  229. video_url = base64.b64decode(video_url).decode("utf8")
  230. audio_url = base64.b64decode(audio_url).decode("utf8")
  231. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  232. "video_2"
  233. ]["vwidth"]
  234. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  235. "video_2"
  236. ]["vheight"]
  237. video_url_dict["video_url"] = video_url
  238. video_url_dict["audio_url"] = audio_url
  239. video_url_dict["video_width"] = video_width
  240. video_url_dict["video_height"] = video_height
  241. elif (
  242. "video_list" in video_info["videoResource"]["dash_120fps"]
  243. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  244. ):
  245. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  246. "video_1"
  247. ]["backup_url_1"]
  248. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  249. "video_1"
  250. ]["backup_url_1"]
  251. if len(video_url) % 3 == 1:
  252. video_url += "=="
  253. elif len(video_url) % 3 == 2:
  254. video_url += "="
  255. elif len(audio_url) % 3 == 1:
  256. audio_url += "=="
  257. elif len(audio_url) % 3 == 2:
  258. audio_url += "="
  259. video_url = base64.b64decode(video_url).decode("utf8")
  260. audio_url = base64.b64decode(audio_url).decode("utf8")
  261. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  262. "video_1"
  263. ]["vwidth"]
  264. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  265. "video_1"
  266. ]["vheight"]
  267. video_url_dict["video_url"] = video_url
  268. video_url_dict["audio_url"] = audio_url
  269. video_url_dict["video_width"] = video_width
  270. video_url_dict["video_height"] = video_height
  271. elif (
  272. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  273. and "dynamic_video_list"
  274. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  275. and "dynamic_audio_list"
  276. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  277. and len(
  278. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  279. "dynamic_video_list"
  280. ]
  281. )
  282. != 0
  283. and len(
  284. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  285. "dynamic_audio_list"
  286. ]
  287. )
  288. != 0
  289. ):
  290. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  291. "dynamic_video_list"
  292. ][-1]["backup_url_1"]
  293. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  294. "dynamic_audio_list"
  295. ][-1]["backup_url_1"]
  296. if len(video_url) % 3 == 1:
  297. video_url += "=="
  298. elif len(video_url) % 3 == 2:
  299. video_url += "="
  300. elif len(audio_url) % 3 == 1:
  301. audio_url += "=="
  302. elif len(audio_url) % 3 == 2:
  303. audio_url += "="
  304. video_url = base64.b64decode(video_url).decode("utf8")
  305. audio_url = base64.b64decode(audio_url).decode("utf8")
  306. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  307. "dynamic_video_list"
  308. ][-1]["vwidth"]
  309. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  310. "dynamic_video_list"
  311. ][-1]["vheight"]
  312. video_url_dict["video_url"] = video_url
  313. video_url_dict["audio_url"] = audio_url
  314. video_url_dict["video_width"] = video_width
  315. video_url_dict["video_height"] = video_height
  316. else:
  317. video_url_dict["video_url"] = ""
  318. video_url_dict["audio_url"] = ""
  319. video_url_dict["video_width"] = 0
  320. video_url_dict["video_height"] = 0
  321. elif "dash" in video_info["videoResource"]:
  322. if (
  323. "video_list" in video_info["videoResource"]["dash"]
  324. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  325. ):
  326. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  327. "backup_url_1"
  328. ]
  329. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  330. "backup_url_1"
  331. ]
  332. if len(video_url) % 3 == 1:
  333. video_url += "=="
  334. elif len(video_url) % 3 == 2:
  335. video_url += "="
  336. elif len(audio_url) % 3 == 1:
  337. audio_url += "=="
  338. elif len(audio_url) % 3 == 2:
  339. audio_url += "="
  340. video_url = base64.b64decode(video_url).decode("utf8")
  341. audio_url = base64.b64decode(audio_url).decode("utf8")
  342. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  343. "vwidth"
  344. ]
  345. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  346. "vheight"
  347. ]
  348. video_url_dict["video_url"] = video_url
  349. video_url_dict["audio_url"] = audio_url
  350. video_url_dict["video_width"] = video_width
  351. video_url_dict["video_height"] = video_height
  352. elif (
  353. "video_list" in video_info["videoResource"]["dash"]
  354. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  355. ):
  356. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  357. "backup_url_1"
  358. ]
  359. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  360. "backup_url_1"
  361. ]
  362. if len(video_url) % 3 == 1:
  363. video_url += "=="
  364. elif len(video_url) % 3 == 2:
  365. video_url += "="
  366. elif len(audio_url) % 3 == 1:
  367. audio_url += "=="
  368. elif len(audio_url) % 3 == 2:
  369. audio_url += "="
  370. video_url = base64.b64decode(video_url).decode("utf8")
  371. audio_url = base64.b64decode(audio_url).decode("utf8")
  372. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  373. "vwidth"
  374. ]
  375. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  376. "vheight"
  377. ]
  378. video_url_dict["video_url"] = video_url
  379. video_url_dict["audio_url"] = audio_url
  380. video_url_dict["video_width"] = video_width
  381. video_url_dict["video_height"] = video_height
  382. elif (
  383. "video_list" in video_info["videoResource"]["dash"]
  384. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  385. ):
  386. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  387. "backup_url_1"
  388. ]
  389. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  390. "backup_url_1"
  391. ]
  392. if len(video_url) % 3 == 1:
  393. video_url += "=="
  394. elif len(video_url) % 3 == 2:
  395. video_url += "="
  396. elif len(audio_url) % 3 == 1:
  397. audio_url += "=="
  398. elif len(audio_url) % 3 == 2:
  399. audio_url += "="
  400. video_url = base64.b64decode(video_url).decode("utf8")
  401. audio_url = base64.b64decode(audio_url).decode("utf8")
  402. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  403. "vwidth"
  404. ]
  405. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  406. "vheight"
  407. ]
  408. video_url_dict["video_url"] = video_url
  409. video_url_dict["audio_url"] = audio_url
  410. video_url_dict["video_width"] = video_width
  411. video_url_dict["video_height"] = video_height
  412. elif (
  413. "video_list" in video_info["videoResource"]["dash"]
  414. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  415. ):
  416. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  417. "backup_url_1"
  418. ]
  419. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  420. "backup_url_1"
  421. ]
  422. if len(video_url) % 3 == 1:
  423. video_url += "=="
  424. elif len(video_url) % 3 == 2:
  425. video_url += "="
  426. elif len(audio_url) % 3 == 1:
  427. audio_url += "=="
  428. elif len(audio_url) % 3 == 2:
  429. audio_url += "="
  430. video_url = base64.b64decode(video_url).decode("utf8")
  431. audio_url = base64.b64decode(audio_url).decode("utf8")
  432. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  433. "vwidth"
  434. ]
  435. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  436. "vheight"
  437. ]
  438. video_url_dict["video_url"] = video_url
  439. video_url_dict["audio_url"] = audio_url
  440. video_url_dict["video_width"] = video_width
  441. video_url_dict["video_height"] = video_height
  442. elif (
  443. "dynamic_video" in video_info["videoResource"]["dash"]
  444. and "dynamic_video_list"
  445. in video_info["videoResource"]["dash"]["dynamic_video"]
  446. and "dynamic_audio_list"
  447. in video_info["videoResource"]["dash"]["dynamic_video"]
  448. and len(
  449. video_info["videoResource"]["dash"]["dynamic_video"][
  450. "dynamic_video_list"
  451. ]
  452. )
  453. != 0
  454. and len(
  455. video_info["videoResource"]["dash"]["dynamic_video"][
  456. "dynamic_audio_list"
  457. ]
  458. )
  459. != 0
  460. ):
  461. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  462. "dynamic_video_list"
  463. ][-1]["backup_url_1"]
  464. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  465. "dynamic_audio_list"
  466. ][-1]["backup_url_1"]
  467. if len(video_url) % 3 == 1:
  468. video_url += "=="
  469. elif len(video_url) % 3 == 2:
  470. video_url += "="
  471. elif len(audio_url) % 3 == 1:
  472. audio_url += "=="
  473. elif len(audio_url) % 3 == 2:
  474. audio_url += "="
  475. video_url = base64.b64decode(video_url).decode("utf8")
  476. audio_url = base64.b64decode(audio_url).decode("utf8")
  477. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  478. "dynamic_video_list"
  479. ][-1]["vwidth"]
  480. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  481. "dynamic_video_list"
  482. ][-1]["vheight"]
  483. video_url_dict["video_url"] = video_url
  484. video_url_dict["audio_url"] = audio_url
  485. video_url_dict["video_width"] = video_width
  486. video_url_dict["video_height"] = video_height
  487. else:
  488. video_url_dict["video_url"] = ""
  489. video_url_dict["audio_url"] = ""
  490. video_url_dict["video_width"] = 0
  491. video_url_dict["video_height"] = 0
  492. elif "normal" in video_info["videoResource"]:
  493. if (
  494. "video_list" in video_info["videoResource"]["normal"]
  495. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  496. ):
  497. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  498. "backup_url_1"
  499. ]
  500. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  501. "backup_url_1"
  502. ]
  503. if len(video_url) % 3 == 1:
  504. video_url += "=="
  505. elif len(video_url) % 3 == 2:
  506. video_url += "="
  507. elif len(audio_url) % 3 == 1:
  508. audio_url += "=="
  509. elif len(audio_url) % 3 == 2:
  510. audio_url += "="
  511. video_url = base64.b64decode(video_url).decode("utf8")
  512. audio_url = base64.b64decode(audio_url).decode("utf8")
  513. video_width = video_info["videoResource"]["normal"]["video_list"][
  514. "video_4"
  515. ]["vwidth"]
  516. video_height = video_info["videoResource"]["normal"]["video_list"][
  517. "video_4"
  518. ]["vheight"]
  519. video_url_dict["video_url"] = video_url
  520. video_url_dict["audio_url"] = audio_url
  521. video_url_dict["video_width"] = video_width
  522. video_url_dict["video_height"] = video_height
  523. elif (
  524. "video_list" in video_info["videoResource"]["normal"]
  525. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  526. ):
  527. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  528. "backup_url_1"
  529. ]
  530. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  531. "backup_url_1"
  532. ]
  533. if len(video_url) % 3 == 1:
  534. video_url += "=="
  535. elif len(video_url) % 3 == 2:
  536. video_url += "="
  537. elif len(audio_url) % 3 == 1:
  538. audio_url += "=="
  539. elif len(audio_url) % 3 == 2:
  540. audio_url += "="
  541. video_url = base64.b64decode(video_url).decode("utf8")
  542. audio_url = base64.b64decode(audio_url).decode("utf8")
  543. video_width = video_info["videoResource"]["normal"]["video_list"][
  544. "video_3"
  545. ]["vwidth"]
  546. video_height = video_info["videoResource"]["normal"]["video_list"][
  547. "video_3"
  548. ]["vheight"]
  549. video_url_dict["video_url"] = video_url
  550. video_url_dict["audio_url"] = audio_url
  551. video_url_dict["video_width"] = video_width
  552. video_url_dict["video_height"] = video_height
  553. elif (
  554. "video_list" in video_info["videoResource"]["normal"]
  555. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  556. ):
  557. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  558. "backup_url_1"
  559. ]
  560. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  561. "backup_url_1"
  562. ]
  563. if len(video_url) % 3 == 1:
  564. video_url += "=="
  565. elif len(video_url) % 3 == 2:
  566. video_url += "="
  567. elif len(audio_url) % 3 == 1:
  568. audio_url += "=="
  569. elif len(audio_url) % 3 == 2:
  570. audio_url += "="
  571. video_url = base64.b64decode(video_url).decode("utf8")
  572. audio_url = base64.b64decode(audio_url).decode("utf8")
  573. video_width = video_info["videoResource"]["normal"]["video_list"][
  574. "video_2"
  575. ]["vwidth"]
  576. video_height = video_info["videoResource"]["normal"]["video_list"][
  577. "video_2"
  578. ]["vheight"]
  579. video_url_dict["video_url"] = video_url
  580. video_url_dict["audio_url"] = audio_url
  581. video_url_dict["video_width"] = video_width
  582. video_url_dict["video_height"] = video_height
  583. elif (
  584. "video_list" in video_info["videoResource"]["normal"]
  585. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  586. ):
  587. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  588. "backup_url_1"
  589. ]
  590. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  591. "backup_url_1"
  592. ]
  593. if len(video_url) % 3 == 1:
  594. video_url += "=="
  595. elif len(video_url) % 3 == 2:
  596. video_url += "="
  597. elif len(audio_url) % 3 == 1:
  598. audio_url += "=="
  599. elif len(audio_url) % 3 == 2:
  600. audio_url += "="
  601. video_url = base64.b64decode(video_url).decode("utf8")
  602. audio_url = base64.b64decode(audio_url).decode("utf8")
  603. video_width = video_info["videoResource"]["normal"]["video_list"][
  604. "video_1"
  605. ]["vwidth"]
  606. video_height = video_info["videoResource"]["normal"]["video_list"][
  607. "video_1"
  608. ]["vheight"]
  609. video_url_dict["video_url"] = video_url
  610. video_url_dict["audio_url"] = audio_url
  611. video_url_dict["video_width"] = video_width
  612. video_url_dict["video_height"] = video_height
  613. elif (
  614. "dynamic_video" in video_info["videoResource"]["normal"]
  615. and "dynamic_video_list"
  616. in video_info["videoResource"]["normal"]["dynamic_video"]
  617. and "dynamic_audio_list"
  618. in video_info["videoResource"]["normal"]["dynamic_video"]
  619. and len(
  620. video_info["videoResource"]["normal"]["dynamic_video"][
  621. "dynamic_video_list"
  622. ]
  623. )
  624. != 0
  625. and len(
  626. video_info["videoResource"]["normal"]["dynamic_video"][
  627. "dynamic_audio_list"
  628. ]
  629. )
  630. != 0
  631. ):
  632. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  633. "dynamic_video_list"
  634. ][-1]["backup_url_1"]
  635. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  636. "dynamic_audio_list"
  637. ][-1]["backup_url_1"]
  638. if len(video_url) % 3 == 1:
  639. video_url += "=="
  640. elif len(video_url) % 3 == 2:
  641. video_url += "="
  642. elif len(audio_url) % 3 == 1:
  643. audio_url += "=="
  644. elif len(audio_url) % 3 == 2:
  645. audio_url += "="
  646. video_url = base64.b64decode(video_url).decode("utf8")
  647. audio_url = base64.b64decode(audio_url).decode("utf8")
  648. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  649. "dynamic_video_list"
  650. ][-1]["vwidth"]
  651. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  652. "dynamic_video_list"
  653. ][-1]["vheight"]
  654. video_url_dict["video_url"] = video_url
  655. video_url_dict["audio_url"] = audio_url
  656. video_url_dict["video_width"] = video_width
  657. video_url_dict["video_height"] = video_height
  658. else:
  659. video_url_dict["video_url"] = ""
  660. video_url_dict["audio_url"] = ""
  661. video_url_dict["video_width"] = 0
  662. video_url_dict["video_height"] = 0
  663. else:
  664. video_url_dict["video_url"] = ""
  665. video_url_dict["audio_url"] = ""
  666. video_url_dict["video_width"] = 0
  667. video_url_dict["video_height"] = 0
  668. return video_url_dict
  669. def get_comment_cnt(item_id):
  670. """
  671. 获取视频的评论数量
  672. """
  673. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  674. params = {
  675. "tab_index": "0",
  676. "count": "10",
  677. "offset": "10",
  678. "group_id": str(item_id),
  679. "item_id": str(item_id),
  680. "aid": "1768",
  681. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  682. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  683. "_signature": random_signature(),
  684. }
  685. headers = {
  686. "authority": "www.ixigua.com",
  687. "accept": "application/json, text/plain, */*",
  688. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  689. "cache-control": "no-cache",
  690. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  691. "pragma": "no-cache",
  692. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  693. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  694. "sec-ch-ua-mobile": "?0",
  695. "sec-ch-ua-platform": '"macOS"',
  696. "sec-fetch-dest": "empty",
  697. "sec-fetch-mode": "cors",
  698. "sec-fetch-site": "same-origin",
  699. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  700. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  701. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  702. }
  703. response = requests.get(
  704. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  705. )
  706. response.close()
  707. if (
  708. response.status_code != 200
  709. or "total_number" not in response.json()
  710. or response.json() == {}
  711. ):
  712. return 0
  713. return response.json().get("total_number", 0)
  714. class XiGuaAuthor:
  715. """
  716. 西瓜账号爬虫
  717. """
  718. def __init__(self, platform, mode, rule_dict, env, user_list):
  719. self.platform = platform
  720. self.mode = mode
  721. self.rule_dict = rule_dict
  722. self.env = env
  723. self.user_list = user_list
  724. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  725. self.download_count = 0
  726. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  727. def rule_maker(self, account):
  728. """
  729. 通过不同的账号生成不同的规则
  730. :param account: 输入的账号信息
  731. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  732. """
  733. temp = account['link'].split("?")[0].split("_")
  734. if len(temp) == 1:
  735. return self.rule_dict
  736. else:
  737. flag = temp[-2]
  738. match flag:
  739. case "V1":
  740. rule_dict = {
  741. "play_cnt": {"min": 100000, "max": 0},
  742. 'period': {"min": 90, "max": 90},
  743. 'special': 0.02
  744. }
  745. return rule_dict
  746. case "V2":
  747. rule_dict = {
  748. "play_cnt": {"min": 10000, "max": 0},
  749. 'period': {"min": 90, "max": 90},
  750. 'special': 0.01
  751. }
  752. return rule_dict
  753. case "V3":
  754. rule_dict = {
  755. "play_cnt": {"min": 5000, "max": 0},
  756. 'period': {"min": 90, "max": 90},
  757. 'special': 0.01
  758. }
  759. return rule_dict
  760. def get_author_list(self):
  761. """
  762. 每轮只抓取定量的数据,到达数量后自己退出
  763. 获取账号列表以及账号信息
  764. """
  765. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  766. for user_dict in self.user_list:
  767. # if self.download_count <= max_count:
  768. try:
  769. flag = user_dict["link"][0]
  770. match flag:
  771. case "V":
  772. self.get_video_list(user_dict)
  773. case "X":
  774. self.get_tiny_video_list(user_dict)
  775. case "h":
  776. self.get_video_list(user_dict)
  777. case "D":
  778. self.get_video_list(user_dict)
  779. case "B":
  780. self.get_video_list(user_dict)
  781. self.get_tiny_video_list(user_dict)
  782. except Exception as e:
  783. AliyunLogger.logging(
  784. code="3001",
  785. account=user_dict["uid"],
  786. platform=self.platform,
  787. mode=self.mode,
  788. env=self.env,
  789. message="扫描账号时出现bug, 报错是 {}".format(e)
  790. )
  791. # time.sleep(random.randint(1, 15))
  792. # else:
  793. # AliyunLogger.logging(
  794. # code="2000",
  795. # platform=self.platform,
  796. # mode=self.mode,
  797. # env=self.env,
  798. # message="本轮已经抓取足够数量的视频,已经自动退出",
  799. # )
  800. # return
  801. def get_video_list(self, user_dict):
  802. """
  803. 获取某个账号的视频列表
  804. 账号分为 3 类
  805. """
  806. offset = 0
  807. signature = random_signature()
  808. link = user_dict['link'].split("?")[0].split("_")[-1]
  809. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  810. while True:
  811. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  812. params = {
  813. "to_user_id": to_user_id,
  814. "offset": str(offset),
  815. "limit": "30",
  816. "maxBehotTime": "0",
  817. "order": "new",
  818. "isHome": "0",
  819. "_signature": signature,
  820. }
  821. headers = {
  822. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  823. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  824. }
  825. response = requests.get(
  826. url=url,
  827. headers=headers,
  828. params=params,
  829. proxies=tunnel_proxies(),
  830. timeout=5,
  831. )
  832. offset += 30
  833. if "data" not in response.text or response.status_code != 200:
  834. AliyunLogger.logging(
  835. code="3000",
  836. platform=self.platform,
  837. mode=self.mode,
  838. env=self.env,
  839. message=f"get_videoList:{response.text}\n",
  840. )
  841. return
  842. elif not response.json()["data"]["videoList"]:
  843. AliyunLogger.logging(
  844. account=link,
  845. code="3000",
  846. platform=self.platform,
  847. mode=self.mode,
  848. env=self.env,
  849. data=response.json(),
  850. message=f"没有更多数据啦~\n",
  851. )
  852. return
  853. else:
  854. feeds = response.json()["data"]["videoList"]
  855. for video_obj in feeds:
  856. try:
  857. AliyunLogger.logging(
  858. code="1001",
  859. account=user_dict['uid'],
  860. platform=self.platform,
  861. mode=self.mode,
  862. env=self.env,
  863. data=video_obj,
  864. message="扫描到一条视频",
  865. )
  866. date_flag = self.process_video_obj(video_obj, user_dict, "l")
  867. if not date_flag:
  868. return
  869. except Exception as e:
  870. AliyunLogger.logging(
  871. code="3000",
  872. platform=self.platform,
  873. mode=self.mode,
  874. env=self.env,
  875. data=video_obj,
  876. message="抓取单条视频异常, 报错原因是: {}".format(e),
  877. )
  878. def get_tiny_video_list(self, user_dict):
  879. """
  880. 获取小视频
  881. """
  882. url = "https://www.ixigua.com/api/videov2/hotsoon/video"
  883. max_behot_time = "0"
  884. link = user_dict['link'].split("?")[0].split("_")[-1]
  885. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  886. while True:
  887. params = {
  888. "to_user_id": to_user_id,
  889. "max_behot_time": max_behot_time,
  890. "_signature": random_signature()
  891. }
  892. headers = {
  893. "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
  894. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  895. }
  896. response = requests.get(
  897. url=url,
  898. headers=headers,
  899. params=params,
  900. proxies=tunnel_proxies(),
  901. timeout=5,
  902. )
  903. if "data" not in response.text or response.status_code != 200:
  904. AliyunLogger.logging(
  905. code="2000",
  906. platform=self.platform,
  907. mode=self.mode,
  908. env=self.env,
  909. message=f"get_videoList:{response.text}\n",
  910. )
  911. return
  912. elif not response.json()["data"]["data"]:
  913. AliyunLogger.logging(
  914. account=link,
  915. code="2000",
  916. platform=self.platform,
  917. mode=self.mode,
  918. env=self.env,
  919. data=response.json(),
  920. message=f"没有更多数据啦~\n",
  921. )
  922. return
  923. else:
  924. video_list = response.json()['data']['data']
  925. max_behot_time = video_list[-1]["max_behot_time"]
  926. for video_obj in video_list:
  927. try:
  928. AliyunLogger.logging(
  929. code="1001",
  930. account=user_dict['uid'],
  931. platform=self.platform,
  932. mode=self.mode,
  933. env=self.env,
  934. data=video_obj,
  935. message="扫描到一条小视频",
  936. )
  937. date_flag = self.process_video_obj(video_obj, user_dict, "s")
  938. if not date_flag:
  939. return
  940. except Exception as e:
  941. AliyunLogger.logging(
  942. code="3000",
  943. platform=self.platform,
  944. mode=self.mode,
  945. env=self.env,
  946. data=video_obj,
  947. message="抓取单条视频异常, 报错原因是: {}".format(e),
  948. )
  949. def process_video_obj(self, video_obj, user_dict, f):
  950. """
  951. process video_obj and extract video_url
  952. """
  953. new_rule = self.rule_maker(user_dict)
  954. trace_id = self.platform + str(uuid.uuid1())
  955. if f == "s":
  956. item_id = video_obj.get("id_str", "")
  957. else:
  958. item_id = video_obj.get("item_id", "")
  959. if not item_id:
  960. AliyunLogger.logging(
  961. code="2005",
  962. account=user_dict['uid'],
  963. platform=self.platform,
  964. mode=self.mode,
  965. env=self.env,
  966. message="无效视频",
  967. data=video_obj,
  968. trace_id=trace_id,
  969. )
  970. return
  971. # 获取视频信息
  972. video_dict = self.get_video_info(item_id=item_id)
  973. video_dict["platform"] = self.platform
  974. video_dict["strategy"] = self.mode
  975. video_dict["out_video_id"] = video_dict["video_id"]
  976. video_dict["width"] = video_dict["video_width"]
  977. video_dict["height"] = video_dict["video_height"]
  978. video_dict["crawler_rule"] = json.dumps(new_rule)
  979. video_dict["user_id"] = user_dict["uid"]
  980. video_dict["publish_time"] = video_dict["publish_time_str"]
  981. video_dict["strategy_type"] = self.mode
  982. video_dict["update_time_stamp"] = int(time.time())
  983. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  984. new_rule.get("period", {}).get("max", 1000)):
  985. if not video_obj['is_top']:
  986. """
  987. 非置顶数据发布时间超过才退出
  988. """
  989. AliyunLogger.logging(
  990. code="2004",
  991. account=user_dict['uid'],
  992. platform=self.platform,
  993. mode=self.mode,
  994. env=self.env,
  995. data=video_dict,
  996. message="发布时间超过{}天".format(
  997. int(new_rule.get("period", {}).get("max", 1000))
  998. ),
  999. )
  1000. return False
  1001. pipeline = PiaoQuanPipeline(
  1002. platform=self.platform,
  1003. mode=self.mode,
  1004. rule_dict=new_rule,
  1005. env=self.env,
  1006. item=video_dict,
  1007. trace_id=trace_id,
  1008. )
  1009. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  1010. if limit_flag:
  1011. title_flag = pipeline.title_flag()
  1012. repeat_flag = pipeline.repeat_video()
  1013. if title_flag and repeat_flag:
  1014. if new_rule.get("special"):
  1015. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1016. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  1017. self.mq.send_msg(video_dict)
  1018. self.download_count += 1
  1019. AliyunLogger.logging(
  1020. code="1002",
  1021. account=user_dict['uid'],
  1022. platform=self.platform,
  1023. mode=self.mode,
  1024. env=self.env,
  1025. data=video_dict,
  1026. trace_id=trace_id,
  1027. message="成功发送 MQ 至 ETL",
  1028. )
  1029. return True
  1030. else:
  1031. AliyunLogger.logging(
  1032. code="2008",
  1033. account=user_dict['uid'],
  1034. platform=self.platform,
  1035. mode=self.mode,
  1036. env=self.env,
  1037. message="不满足特殊规则, 点赞量/播放量",
  1038. data=video_dict
  1039. )
  1040. else:
  1041. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1042. self.mq.send_msg(video_dict)
  1043. self.download_count += 1
  1044. AliyunLogger.logging(
  1045. code="1002",
  1046. account=user_dict['uid'],
  1047. platform=self.platform,
  1048. mode=self.mode,
  1049. env=self.env,
  1050. data=video_dict,
  1051. trace_id=trace_id,
  1052. message="成功发送 MQ 至 ETL",
  1053. )
  1054. return True
  1055. else:
  1056. AliyunLogger.logging(
  1057. code="2008",
  1058. account=user_dict['uid'],
  1059. platform=self.platform,
  1060. mode=self.mode,
  1061. env=self.env,
  1062. message="不满足特殊规则, 播放量",
  1063. data=video_dict
  1064. )
  1065. return True
  1066. def get_video_info(self, item_id):
  1067. """
  1068. 获取视频信息
  1069. """
  1070. url = "https://www.ixigua.com/{}".format(item_id)
  1071. headers = {
  1072. "accept-encoding": "gzip, deflate",
  1073. "accept-language": "zh-CN,zh-Hans;q=0.9",
  1074. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  1075. "user-agent": FakeUserAgent().random,
  1076. "referer": "https://www.ixigua.com/{}/".format(item_id),
  1077. }
  1078. response = requests.get(
  1079. url=url,
  1080. headers=headers,
  1081. proxies=tunnel_proxies(),
  1082. timeout=5,
  1083. )
  1084. video_info = extract_info_by_re(response.text)
  1085. video_dict = {
  1086. "video_title": video_info.get("title", ""),
  1087. "video_id": video_info.get("video_id"),
  1088. "gid": str(item_id),
  1089. "play_cnt": int(video_info.get("play_count", 0)),
  1090. "like_cnt": int(video_info.get("like_count", 0)),
  1091. "comment_cnt": 0,
  1092. "share_cnt": 0,
  1093. "favorite_cnt": 0,
  1094. "duration": int(video_info.get("duration", 0)),
  1095. "video_width": 0,
  1096. "video_height": 0,
  1097. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  1098. "publish_time_str": time.strftime(
  1099. "%Y-%m-%d %H:%M:%S",
  1100. time.localtime(int(video_info.get("publish_time", 0))),
  1101. ),
  1102. "avatar_url": str(
  1103. video_info.get("user_info", {}).get("avatar_url", "")
  1104. ),
  1105. "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
  1106. "video_url": video_info.get("url"),
  1107. "session": f"xigua-author-{int(time.time())}",
  1108. }
  1109. return video_dict
  1110. if __name__ == "__main__":
  1111. user_list = [
  1112. {
  1113. "uid": 6267140,
  1114. "source": "xigua",
  1115. "link": "https://www.ixigua.com/home/2779177225827568",
  1116. "nick_name": "秋晴爱音乐",
  1117. "avatar_url": "",
  1118. "mode": "author",
  1119. },
  1120. {
  1121. "uid": 6267140,
  1122. "source": "xigua",
  1123. "link": "https://www.ixigua.com/home/2885546124776780",
  1124. "nick_name": "朗诵放歌的老山羊",
  1125. "avatar_url": "",
  1126. "mode": "author",
  1127. },
  1128. {
  1129. "uid": 6267140,
  1130. "source": "xigua",
  1131. "link": "https://www.ixigua.com/home/5880938217",
  1132. "nick_name": "天原声疗",
  1133. "avatar_url": "",
  1134. "mode": "author",
  1135. },
  1136. ]
  1137. rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1138. XGA = XiGuaAuthor(
  1139. platform="xigua",
  1140. mode="author",
  1141. rule_dict=rule,
  1142. env="prod",
  1143. user_list=user_list
  1144. )
  1145. XGA.get_author_list()