xigua_author.py 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039
  1. import json
  2. import os
  3. import random
  4. import sys
  5. import string
  6. import time
  7. import uuid
  8. import base64
  9. import requests
  10. from fake_useragent import FakeUserAgent
  11. from common.mq import MQ
  12. sys.path.append(os.getcwd())
  13. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  14. from common.limit import AuthorLimit
  15. def random_signature():
  16. """
  17. 随机生成签名
  18. """
  19. src_digits = string.digits # string_数字
  20. src_uppercase = string.ascii_uppercase # string_大写字母
  21. src_lowercase = string.ascii_lowercase # string_小写字母
  22. digits_num = random.randint(1, 6)
  23. uppercase_num = random.randint(1, 26 - digits_num - 1)
  24. lowercase_num = 26 - (digits_num + uppercase_num)
  25. password = (
  26. random.sample(src_digits, digits_num)
  27. + random.sample(src_uppercase, uppercase_num)
  28. + random.sample(src_lowercase, lowercase_num)
  29. )
  30. random.shuffle(password)
  31. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  32. new_password_start = new_password[0:18]
  33. new_password_end = new_password[-7:]
  34. if new_password[18] == "8":
  35. new_password = new_password_start + "w" + new_password_end
  36. elif new_password[18] == "9":
  37. new_password = new_password_start + "x" + new_password_end
  38. elif new_password[18] == "-":
  39. new_password = new_password_start + "y" + new_password_end
  40. elif new_password[18] == ".":
  41. new_password = new_password_start + "z" + new_password_end
  42. else:
  43. new_password = new_password_start + "y" + new_password_end
  44. return new_password
  45. def get_video_url(video_info):
  46. """
  47. 获取视频的链接
  48. """
  49. video_url_dict = {}
  50. # video_url
  51. if "videoResource" not in video_info:
  52. video_url_dict["video_url"] = ""
  53. video_url_dict["audio_url"] = ""
  54. video_url_dict["video_width"] = 0
  55. video_url_dict["video_height"] = 0
  56. elif "dash_120fps" in video_info["videoResource"]:
  57. if (
  58. "video_list" in video_info["videoResource"]["dash_120fps"]
  59. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  60. ):
  61. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  62. "video_4"
  63. ]["backup_url_1"]
  64. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  65. "video_4"
  66. ]["backup_url_1"]
  67. if len(video_url) % 3 == 1:
  68. video_url += "=="
  69. elif len(video_url) % 3 == 2:
  70. video_url += "="
  71. elif len(audio_url) % 3 == 1:
  72. audio_url += "=="
  73. elif len(audio_url) % 3 == 2:
  74. audio_url += "="
  75. video_url = base64.b64decode(video_url).decode("utf8")
  76. audio_url = base64.b64decode(audio_url).decode("utf8")
  77. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  78. "video_4"
  79. ]["vwidth"]
  80. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  81. "video_4"
  82. ]["vheight"]
  83. video_url_dict["video_url"] = video_url
  84. video_url_dict["audio_url"] = audio_url
  85. video_url_dict["video_width"] = video_width
  86. video_url_dict["video_height"] = video_height
  87. elif (
  88. "video_list" in video_info["videoResource"]["dash_120fps"]
  89. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  90. ):
  91. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  92. "video_3"
  93. ]["backup_url_1"]
  94. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  95. "video_3"
  96. ]["backup_url_1"]
  97. if len(video_url) % 3 == 1:
  98. video_url += "=="
  99. elif len(video_url) % 3 == 2:
  100. video_url += "="
  101. elif len(audio_url) % 3 == 1:
  102. audio_url += "=="
  103. elif len(audio_url) % 3 == 2:
  104. audio_url += "="
  105. video_url = base64.b64decode(video_url).decode("utf8")
  106. audio_url = base64.b64decode(audio_url).decode("utf8")
  107. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  108. "video_3"
  109. ]["vwidth"]
  110. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  111. "video_3"
  112. ]["vheight"]
  113. video_url_dict["video_url"] = video_url
  114. video_url_dict["audio_url"] = audio_url
  115. video_url_dict["video_width"] = video_width
  116. video_url_dict["video_height"] = video_height
  117. elif (
  118. "video_list" in video_info["videoResource"]["dash_120fps"]
  119. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  120. ):
  121. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  122. "video_2"
  123. ]["backup_url_1"]
  124. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  125. "video_2"
  126. ]["backup_url_1"]
  127. if len(video_url) % 3 == 1:
  128. video_url += "=="
  129. elif len(video_url) % 3 == 2:
  130. video_url += "="
  131. elif len(audio_url) % 3 == 1:
  132. audio_url += "=="
  133. elif len(audio_url) % 3 == 2:
  134. audio_url += "="
  135. video_url = base64.b64decode(video_url).decode("utf8")
  136. audio_url = base64.b64decode(audio_url).decode("utf8")
  137. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  138. "video_2"
  139. ]["vwidth"]
  140. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  141. "video_2"
  142. ]["vheight"]
  143. video_url_dict["video_url"] = video_url
  144. video_url_dict["audio_url"] = audio_url
  145. video_url_dict["video_width"] = video_width
  146. video_url_dict["video_height"] = video_height
  147. elif (
  148. "video_list" in video_info["videoResource"]["dash_120fps"]
  149. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  150. ):
  151. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  152. "video_1"
  153. ]["backup_url_1"]
  154. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  155. "video_1"
  156. ]["backup_url_1"]
  157. if len(video_url) % 3 == 1:
  158. video_url += "=="
  159. elif len(video_url) % 3 == 2:
  160. video_url += "="
  161. elif len(audio_url) % 3 == 1:
  162. audio_url += "=="
  163. elif len(audio_url) % 3 == 2:
  164. audio_url += "="
  165. video_url = base64.b64decode(video_url).decode("utf8")
  166. audio_url = base64.b64decode(audio_url).decode("utf8")
  167. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  168. "video_1"
  169. ]["vwidth"]
  170. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  171. "video_1"
  172. ]["vheight"]
  173. video_url_dict["video_url"] = video_url
  174. video_url_dict["audio_url"] = audio_url
  175. video_url_dict["video_width"] = video_width
  176. video_url_dict["video_height"] = video_height
  177. elif (
  178. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  179. and "dynamic_video_list"
  180. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  181. and "dynamic_audio_list"
  182. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  183. and len(
  184. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  185. "dynamic_video_list"
  186. ]
  187. )
  188. != 0
  189. and len(
  190. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  191. "dynamic_audio_list"
  192. ]
  193. )
  194. != 0
  195. ):
  196. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  197. "dynamic_video_list"
  198. ][-1]["backup_url_1"]
  199. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  200. "dynamic_audio_list"
  201. ][-1]["backup_url_1"]
  202. if len(video_url) % 3 == 1:
  203. video_url += "=="
  204. elif len(video_url) % 3 == 2:
  205. video_url += "="
  206. elif len(audio_url) % 3 == 1:
  207. audio_url += "=="
  208. elif len(audio_url) % 3 == 2:
  209. audio_url += "="
  210. video_url = base64.b64decode(video_url).decode("utf8")
  211. audio_url = base64.b64decode(audio_url).decode("utf8")
  212. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  213. "dynamic_video_list"
  214. ][-1]["vwidth"]
  215. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  216. "dynamic_video_list"
  217. ][-1]["vheight"]
  218. video_url_dict["video_url"] = video_url
  219. video_url_dict["audio_url"] = audio_url
  220. video_url_dict["video_width"] = video_width
  221. video_url_dict["video_height"] = video_height
  222. else:
  223. video_url_dict["video_url"] = ""
  224. video_url_dict["audio_url"] = ""
  225. video_url_dict["video_width"] = 0
  226. video_url_dict["video_height"] = 0
  227. elif "dash" in video_info["videoResource"]:
  228. if (
  229. "video_list" in video_info["videoResource"]["dash"]
  230. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  231. ):
  232. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  233. "backup_url_1"
  234. ]
  235. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  236. "backup_url_1"
  237. ]
  238. if len(video_url) % 3 == 1:
  239. video_url += "=="
  240. elif len(video_url) % 3 == 2:
  241. video_url += "="
  242. elif len(audio_url) % 3 == 1:
  243. audio_url += "=="
  244. elif len(audio_url) % 3 == 2:
  245. audio_url += "="
  246. video_url = base64.b64decode(video_url).decode("utf8")
  247. audio_url = base64.b64decode(audio_url).decode("utf8")
  248. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  249. "vwidth"
  250. ]
  251. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  252. "vheight"
  253. ]
  254. video_url_dict["video_url"] = video_url
  255. video_url_dict["audio_url"] = audio_url
  256. video_url_dict["video_width"] = video_width
  257. video_url_dict["video_height"] = video_height
  258. elif (
  259. "video_list" in video_info["videoResource"]["dash"]
  260. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  261. ):
  262. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  263. "backup_url_1"
  264. ]
  265. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  266. "backup_url_1"
  267. ]
  268. if len(video_url) % 3 == 1:
  269. video_url += "=="
  270. elif len(video_url) % 3 == 2:
  271. video_url += "="
  272. elif len(audio_url) % 3 == 1:
  273. audio_url += "=="
  274. elif len(audio_url) % 3 == 2:
  275. audio_url += "="
  276. video_url = base64.b64decode(video_url).decode("utf8")
  277. audio_url = base64.b64decode(audio_url).decode("utf8")
  278. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  279. "vwidth"
  280. ]
  281. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  282. "vheight"
  283. ]
  284. video_url_dict["video_url"] = video_url
  285. video_url_dict["audio_url"] = audio_url
  286. video_url_dict["video_width"] = video_width
  287. video_url_dict["video_height"] = video_height
  288. elif (
  289. "video_list" in video_info["videoResource"]["dash"]
  290. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  291. ):
  292. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  293. "backup_url_1"
  294. ]
  295. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  296. "backup_url_1"
  297. ]
  298. if len(video_url) % 3 == 1:
  299. video_url += "=="
  300. elif len(video_url) % 3 == 2:
  301. video_url += "="
  302. elif len(audio_url) % 3 == 1:
  303. audio_url += "=="
  304. elif len(audio_url) % 3 == 2:
  305. audio_url += "="
  306. video_url = base64.b64decode(video_url).decode("utf8")
  307. audio_url = base64.b64decode(audio_url).decode("utf8")
  308. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  309. "vwidth"
  310. ]
  311. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  312. "vheight"
  313. ]
  314. video_url_dict["video_url"] = video_url
  315. video_url_dict["audio_url"] = audio_url
  316. video_url_dict["video_width"] = video_width
  317. video_url_dict["video_height"] = video_height
  318. elif (
  319. "video_list" in video_info["videoResource"]["dash"]
  320. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  321. ):
  322. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  323. "backup_url_1"
  324. ]
  325. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  326. "backup_url_1"
  327. ]
  328. if len(video_url) % 3 == 1:
  329. video_url += "=="
  330. elif len(video_url) % 3 == 2:
  331. video_url += "="
  332. elif len(audio_url) % 3 == 1:
  333. audio_url += "=="
  334. elif len(audio_url) % 3 == 2:
  335. audio_url += "="
  336. video_url = base64.b64decode(video_url).decode("utf8")
  337. audio_url = base64.b64decode(audio_url).decode("utf8")
  338. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  339. "vwidth"
  340. ]
  341. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  342. "vheight"
  343. ]
  344. video_url_dict["video_url"] = video_url
  345. video_url_dict["audio_url"] = audio_url
  346. video_url_dict["video_width"] = video_width
  347. video_url_dict["video_height"] = video_height
  348. elif (
  349. "dynamic_video" in video_info["videoResource"]["dash"]
  350. and "dynamic_video_list"
  351. in video_info["videoResource"]["dash"]["dynamic_video"]
  352. and "dynamic_audio_list"
  353. in video_info["videoResource"]["dash"]["dynamic_video"]
  354. and len(
  355. video_info["videoResource"]["dash"]["dynamic_video"][
  356. "dynamic_video_list"
  357. ]
  358. )
  359. != 0
  360. and len(
  361. video_info["videoResource"]["dash"]["dynamic_video"][
  362. "dynamic_audio_list"
  363. ]
  364. )
  365. != 0
  366. ):
  367. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  368. "dynamic_video_list"
  369. ][-1]["backup_url_1"]
  370. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  371. "dynamic_audio_list"
  372. ][-1]["backup_url_1"]
  373. if len(video_url) % 3 == 1:
  374. video_url += "=="
  375. elif len(video_url) % 3 == 2:
  376. video_url += "="
  377. elif len(audio_url) % 3 == 1:
  378. audio_url += "=="
  379. elif len(audio_url) % 3 == 2:
  380. audio_url += "="
  381. video_url = base64.b64decode(video_url).decode("utf8")
  382. audio_url = base64.b64decode(audio_url).decode("utf8")
  383. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  384. "dynamic_video_list"
  385. ][-1]["vwidth"]
  386. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  387. "dynamic_video_list"
  388. ][-1]["vheight"]
  389. video_url_dict["video_url"] = video_url
  390. video_url_dict["audio_url"] = audio_url
  391. video_url_dict["video_width"] = video_width
  392. video_url_dict["video_height"] = video_height
  393. else:
  394. video_url_dict["video_url"] = ""
  395. video_url_dict["audio_url"] = ""
  396. video_url_dict["video_width"] = 0
  397. video_url_dict["video_height"] = 0
  398. elif "normal" in video_info["videoResource"]:
  399. if (
  400. "video_list" in video_info["videoResource"]["normal"]
  401. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  402. ):
  403. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  404. "backup_url_1"
  405. ]
  406. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  407. "backup_url_1"
  408. ]
  409. if len(video_url) % 3 == 1:
  410. video_url += "=="
  411. elif len(video_url) % 3 == 2:
  412. video_url += "="
  413. elif len(audio_url) % 3 == 1:
  414. audio_url += "=="
  415. elif len(audio_url) % 3 == 2:
  416. audio_url += "="
  417. video_url = base64.b64decode(video_url).decode("utf8")
  418. audio_url = base64.b64decode(audio_url).decode("utf8")
  419. video_width = video_info["videoResource"]["normal"]["video_list"][
  420. "video_4"
  421. ]["vwidth"]
  422. video_height = video_info["videoResource"]["normal"]["video_list"][
  423. "video_4"
  424. ]["vheight"]
  425. video_url_dict["video_url"] = video_url
  426. video_url_dict["audio_url"] = audio_url
  427. video_url_dict["video_width"] = video_width
  428. video_url_dict["video_height"] = video_height
  429. elif (
  430. "video_list" in video_info["videoResource"]["normal"]
  431. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  432. ):
  433. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  434. "backup_url_1"
  435. ]
  436. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  437. "backup_url_1"
  438. ]
  439. if len(video_url) % 3 == 1:
  440. video_url += "=="
  441. elif len(video_url) % 3 == 2:
  442. video_url += "="
  443. elif len(audio_url) % 3 == 1:
  444. audio_url += "=="
  445. elif len(audio_url) % 3 == 2:
  446. audio_url += "="
  447. video_url = base64.b64decode(video_url).decode("utf8")
  448. audio_url = base64.b64decode(audio_url).decode("utf8")
  449. video_width = video_info["videoResource"]["normal"]["video_list"][
  450. "video_3"
  451. ]["vwidth"]
  452. video_height = video_info["videoResource"]["normal"]["video_list"][
  453. "video_3"
  454. ]["vheight"]
  455. video_url_dict["video_url"] = video_url
  456. video_url_dict["audio_url"] = audio_url
  457. video_url_dict["video_width"] = video_width
  458. video_url_dict["video_height"] = video_height
  459. elif (
  460. "video_list" in video_info["videoResource"]["normal"]
  461. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  462. ):
  463. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  464. "backup_url_1"
  465. ]
  466. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  467. "backup_url_1"
  468. ]
  469. if len(video_url) % 3 == 1:
  470. video_url += "=="
  471. elif len(video_url) % 3 == 2:
  472. video_url += "="
  473. elif len(audio_url) % 3 == 1:
  474. audio_url += "=="
  475. elif len(audio_url) % 3 == 2:
  476. audio_url += "="
  477. video_url = base64.b64decode(video_url).decode("utf8")
  478. audio_url = base64.b64decode(audio_url).decode("utf8")
  479. video_width = video_info["videoResource"]["normal"]["video_list"][
  480. "video_2"
  481. ]["vwidth"]
  482. video_height = video_info["videoResource"]["normal"]["video_list"][
  483. "video_2"
  484. ]["vheight"]
  485. video_url_dict["video_url"] = video_url
  486. video_url_dict["audio_url"] = audio_url
  487. video_url_dict["video_width"] = video_width
  488. video_url_dict["video_height"] = video_height
  489. elif (
  490. "video_list" in video_info["videoResource"]["normal"]
  491. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  492. ):
  493. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  494. "backup_url_1"
  495. ]
  496. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  497. "backup_url_1"
  498. ]
  499. if len(video_url) % 3 == 1:
  500. video_url += "=="
  501. elif len(video_url) % 3 == 2:
  502. video_url += "="
  503. elif len(audio_url) % 3 == 1:
  504. audio_url += "=="
  505. elif len(audio_url) % 3 == 2:
  506. audio_url += "="
  507. video_url = base64.b64decode(video_url).decode("utf8")
  508. audio_url = base64.b64decode(audio_url).decode("utf8")
  509. video_width = video_info["videoResource"]["normal"]["video_list"][
  510. "video_1"
  511. ]["vwidth"]
  512. video_height = video_info["videoResource"]["normal"]["video_list"][
  513. "video_1"
  514. ]["vheight"]
  515. video_url_dict["video_url"] = video_url
  516. video_url_dict["audio_url"] = audio_url
  517. video_url_dict["video_width"] = video_width
  518. video_url_dict["video_height"] = video_height
  519. elif (
  520. "dynamic_video" in video_info["videoResource"]["normal"]
  521. and "dynamic_video_list"
  522. in video_info["videoResource"]["normal"]["dynamic_video"]
  523. and "dynamic_audio_list"
  524. in video_info["videoResource"]["normal"]["dynamic_video"]
  525. and len(
  526. video_info["videoResource"]["normal"]["dynamic_video"][
  527. "dynamic_video_list"
  528. ]
  529. )
  530. != 0
  531. and len(
  532. video_info["videoResource"]["normal"]["dynamic_video"][
  533. "dynamic_audio_list"
  534. ]
  535. )
  536. != 0
  537. ):
  538. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  539. "dynamic_video_list"
  540. ][-1]["backup_url_1"]
  541. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  542. "dynamic_audio_list"
  543. ][-1]["backup_url_1"]
  544. if len(video_url) % 3 == 1:
  545. video_url += "=="
  546. elif len(video_url) % 3 == 2:
  547. video_url += "="
  548. elif len(audio_url) % 3 == 1:
  549. audio_url += "=="
  550. elif len(audio_url) % 3 == 2:
  551. audio_url += "="
  552. video_url = base64.b64decode(video_url).decode("utf8")
  553. audio_url = base64.b64decode(audio_url).decode("utf8")
  554. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  555. "dynamic_video_list"
  556. ][-1]["vwidth"]
  557. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  558. "dynamic_video_list"
  559. ][-1]["vheight"]
  560. video_url_dict["video_url"] = video_url
  561. video_url_dict["audio_url"] = audio_url
  562. video_url_dict["video_width"] = video_width
  563. video_url_dict["video_height"] = video_height
  564. else:
  565. video_url_dict["video_url"] = ""
  566. video_url_dict["audio_url"] = ""
  567. video_url_dict["video_width"] = 0
  568. video_url_dict["video_height"] = 0
  569. else:
  570. video_url_dict["video_url"] = ""
  571. video_url_dict["audio_url"] = ""
  572. video_url_dict["video_width"] = 0
  573. video_url_dict["video_height"] = 0
  574. return video_url_dict
  575. def get_comment_cnt(item_id):
  576. """
  577. 获取视频的评论数量
  578. """
  579. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  580. params = {
  581. "tab_index": "0",
  582. "count": "10",
  583. "offset": "10",
  584. "group_id": str(item_id),
  585. "item_id": str(item_id),
  586. "aid": "1768",
  587. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  588. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  589. "_signature": random_signature(),
  590. }
  591. headers = {
  592. "authority": "www.ixigua.com",
  593. "accept": "application/json, text/plain, */*",
  594. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  595. "cache-control": "no-cache",
  596. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  597. "pragma": "no-cache",
  598. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  599. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  600. "sec-ch-ua-mobile": "?0",
  601. "sec-ch-ua-platform": '"macOS"',
  602. "sec-fetch-dest": "empty",
  603. "sec-fetch-mode": "cors",
  604. "sec-fetch-site": "same-origin",
  605. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  606. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  607. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  608. }
  609. response = requests.get(
  610. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  611. )
  612. response.close()
  613. if (
  614. response.status_code != 200
  615. or "total_number" not in response.json()
  616. or response.json() == {}
  617. ):
  618. return 0
  619. return response.json().get("total_number", 0)
  620. class XiGuaAuthor:
  621. """
  622. 西瓜账号爬虫
  623. """
  624. def __init__(self, platform, mode, rule_dict, env, user_list):
  625. self.platform = platform
  626. self.mode = mode
  627. self.rule_dict = rule_dict
  628. self.env = env
  629. self.user_list = user_list
  630. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  631. self.download_count = 0
  632. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  633. def rule_maker(self, account):
  634. """
  635. 通过不同的账号生成不同的规则
  636. :param account: 输入的账号信息
  637. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  638. """
  639. flag = account['link'].split("_")[0]
  640. if flag == "V1":
  641. rule_dict = {
  642. "play_cnt": {"min": 100000, "max": 0},
  643. 'period': {"min": 90, "max": 90},
  644. 'special': 0.02
  645. }
  646. return rule_dict
  647. elif flag == "V2":
  648. rule_dict = {
  649. "play_cnt": {"min": 10000, "max": 0},
  650. 'period': {"min": 90, "max": 90},
  651. 'special': 0.01
  652. }
  653. return rule_dict
  654. elif flag == "V3":
  655. rule_dict = {
  656. "play_cnt": {"min": 5000, "max": 0},
  657. 'period': {"min": 90, "max": 90},
  658. 'special': 0.01
  659. }
  660. return rule_dict
  661. else:
  662. return self.rule_dict
  663. def get_author_list(self):
  664. """
  665. 每轮只抓取定量的数据,到达数量后自己退出
  666. 获取账号列表以及账号信息
  667. """
  668. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  669. for user_dict in self.user_list:
  670. # if self.download_count <= max_count:
  671. try:
  672. self.get_video_list(user_dict)
  673. except Exception as e:
  674. AliyunLogger.logging(
  675. code="3001",
  676. account=user_dict["uid"],
  677. platform=self.platform,
  678. mode=self.mode,
  679. env=self.env,
  680. message="扫描账号时出现bug, 报错是 {}".format(e)
  681. )
  682. # time.sleep(random.randint(1, 15))
  683. # else:
  684. # AliyunLogger.logging(
  685. # code="2000",
  686. # platform=self.platform,
  687. # mode=self.mode,
  688. # env=self.env,
  689. # message="本轮已经抓取足够数量的视频,已经自动退出",
  690. # )
  691. # return
  692. def get_video_list(self, user_dict):
  693. """
  694. 获取某个账号的视频列表
  695. """
  696. offset = 0
  697. signature = random_signature()
  698. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  699. while True:
  700. if user_dict['link'][0] == "V":
  701. link = user_dict["link"][3:]
  702. else:
  703. link = user_dict["link"]
  704. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  705. params = {
  706. "to_user_id": to_user_id,
  707. "offset": str(offset),
  708. "limit": "30",
  709. "maxBehotTime": "0",
  710. "order": "new",
  711. "isHome": "0",
  712. # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  713. # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  714. "_signature": signature,
  715. }
  716. headers = {
  717. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  718. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  719. }
  720. response = requests.get(
  721. url=url,
  722. headers=headers,
  723. params=params,
  724. proxies=tunnel_proxies(),
  725. timeout=5,
  726. )
  727. offset += 30
  728. if "data" not in response.text or response.status_code != 200:
  729. AliyunLogger.logging(
  730. code="2000",
  731. platform=self.platform,
  732. mode=self.mode,
  733. env=self.env,
  734. message=f"get_videoList:{response.text}\n",
  735. )
  736. return
  737. elif not response.json()["data"]["videoList"]:
  738. AliyunLogger.logging(
  739. code="2000",
  740. platform=self.platform,
  741. mode=self.mode,
  742. env=self.env,
  743. message=f"没有更多数据啦~\n",
  744. )
  745. return
  746. else:
  747. feeds = response.json()["data"]["videoList"]
  748. for video_obj in feeds:
  749. try:
  750. AliyunLogger.logging(
  751. code="1001",
  752. account=user_dict['uid'],
  753. platform=self.platform,
  754. mode=self.mode,
  755. env=self.env,
  756. data=video_obj,
  757. message="扫描到一条视频",
  758. )
  759. date_flag = self.process_video_obj(video_obj, user_dict)
  760. if not date_flag:
  761. return
  762. except Exception as e:
  763. AliyunLogger.logging(
  764. code="3000",
  765. platform=self.platform,
  766. mode=self.mode,
  767. env=self.env,
  768. data=video_obj,
  769. message="抓取单条视频异常, 报错原因是: {}".format(e),
  770. )
  771. def process_video_obj(self, video_obj, user_dict):
  772. new_rule = self.rule_maker(user_dict)
  773. trace_id = self.platform + str(uuid.uuid1())
  774. item_id = video_obj.get("item_id", "")
  775. if not item_id:
  776. AliyunLogger.logging(
  777. code="2005",
  778. account=user_dict['uid'],
  779. platform=self.platform,
  780. mode=self.mode,
  781. env=self.env,
  782. message="无效视频",
  783. data=video_obj,
  784. trace_id=trace_id,
  785. )
  786. return
  787. # 获取视频信息
  788. video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
  789. video_dict["out_user_id"] = video_dict["user_id"]
  790. video_dict["platform"] = self.platform
  791. video_dict["strategy"] = self.mode
  792. video_dict["out_video_id"] = video_dict["video_id"]
  793. video_dict["width"] = video_dict["video_width"]
  794. video_dict["height"] = video_dict["video_height"]
  795. video_dict["crawler_rule"] = json.dumps(new_rule)
  796. video_dict["user_id"] = user_dict["uid"]
  797. video_dict["publish_time"] = video_dict["publish_time_str"]
  798. video_dict["strategy_type"] = self.mode
  799. video_dict["update_time_stamp"] = int(time.time())
  800. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  801. new_rule.get("period", {}).get("max", 1000)):
  802. if not video_obj['is_top']:
  803. """
  804. 非置顶数据发布时间超过才退出
  805. """
  806. AliyunLogger.logging(
  807. code="2004",
  808. account=user_dict['uid'],
  809. platform=self.platform,
  810. mode=self.mode,
  811. env=self.env,
  812. data=video_dict,
  813. message="发布时间超过{}天".format(
  814. int(new_rule.get("period", {}).get("max", 1000))
  815. ),
  816. )
  817. return False
  818. pipeline = PiaoQuanPipeline(
  819. platform=self.platform,
  820. mode=self.mode,
  821. rule_dict=new_rule,
  822. env=self.env,
  823. item=video_dict,
  824. trace_id=trace_id,
  825. )
  826. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  827. if limit_flag:
  828. title_flag = pipeline.title_flag()
  829. repeat_flag = pipeline.repeat_video()
  830. if title_flag and repeat_flag:
  831. if new_rule.get("special"):
  832. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  833. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  834. self.mq.send_msg(video_dict)
  835. self.download_count += 1
  836. AliyunLogger.logging(
  837. code="1002",
  838. account=user_dict['uid'],
  839. platform=self.platform,
  840. mode=self.mode,
  841. env=self.env,
  842. data=video_dict,
  843. trace_id=trace_id,
  844. message="成功发送 MQ 至 ETL",
  845. )
  846. return True
  847. else:
  848. AliyunLogger.logging(
  849. code="2008",
  850. account=user_dict['uid'],
  851. platform=self.platform,
  852. mode=self.mode,
  853. env=self.env,
  854. message="不满足特殊规则, 点赞量/播放量",
  855. data=video_dict
  856. )
  857. else:
  858. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  859. self.mq.send_msg(video_dict)
  860. self.download_count += 1
  861. AliyunLogger.logging(
  862. code="1002",
  863. account=user_dict['uid'],
  864. platform=self.platform,
  865. mode=self.mode,
  866. env=self.env,
  867. data=video_dict,
  868. trace_id=trace_id,
  869. message="成功发送 MQ 至 ETL",
  870. )
  871. return True
  872. else:
  873. AliyunLogger.logging(
  874. code="2008",
  875. account=user_dict['uid'],
  876. platform=self.platform,
  877. mode=self.mode,
  878. env=self.env,
  879. message="不满足特殊规则, 播放量",
  880. data=video_dict
  881. )
  882. return True
  883. def get_video_info(self, item_id, trace_id):
  884. url = "https://www.ixigua.com/api/mixVideo/information?"
  885. headers = {
  886. "accept-encoding": "gzip, deflate",
  887. "accept-language": "zh-CN,zh-Hans;q=0.9",
  888. "user-agent": FakeUserAgent().random,
  889. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  890. }
  891. params = {
  892. "mixId": str(item_id),
  893. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
  894. "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  895. "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
  896. "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
  897. "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
  898. }
  899. cookies = {
  900. "ixigua-a-s": "1",
  901. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
  902. "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  903. "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
  904. "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
  905. "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
  906. "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
  907. "__ac_nonce": "06304878000964fdad287",
  908. "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
  909. "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
  910. "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
  911. "_tea_utm_cache_1300": "undefined",
  912. "support_avif": "false",
  913. "support_webp": "false",
  914. "xiguavideopcwebid": "7134967546256016900",
  915. "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
  916. }
  917. response = requests.get(
  918. url=url,
  919. headers=headers,
  920. params=params,
  921. cookies=cookies,
  922. proxies=tunnel_proxies(),
  923. timeout=5,
  924. )
  925. if (
  926. response.status_code != 200
  927. or "data" not in response.json()
  928. or response.json()["data"] == {}
  929. ):
  930. AliyunLogger.logging(
  931. code="2000",
  932. platform=self.platform,
  933. mode=self.mode,
  934. env=self.env,
  935. message="获取视频信息失败",
  936. trace_id=trace_id,
  937. )
  938. return None
  939. else:
  940. video_info = (
  941. response.json()["data"]
  942. .get("gidInformation", {})
  943. .get("packerData", {})
  944. .get("video", {})
  945. )
  946. if video_info == {}:
  947. return None
  948. video_detail = get_video_url(video_info)
  949. video_dict = {
  950. "video_title": video_info.get("title", ""),
  951. "video_id": video_info.get("videoResource", {}).get("vid", ""),
  952. "gid": str(item_id),
  953. "play_cnt": int(video_info.get("video_watch_count", 0)),
  954. "like_cnt": int(video_info.get("video_like_count", 0)),
  955. "comment_cnt": int(get_comment_cnt(item_id)),
  956. "share_cnt": 0,
  957. "favorite_cnt": 0,
  958. "duration": int(video_info.get("video_duration", 0)),
  959. "video_width": int(video_detail["video_width"]),
  960. "video_height": int(video_detail["video_height"]),
  961. "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
  962. "publish_time_str": time.strftime(
  963. "%Y-%m-%d %H:%M:%S",
  964. time.localtime(int(video_info.get("video_publish_time", 0))),
  965. ),
  966. "user_name": video_info.get("user_info", {}).get("name", ""),
  967. "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
  968. "avatar_url": str(
  969. video_info.get("user_info", {}).get("avatar_url", "")
  970. ),
  971. "cover_url": video_info.get("poster_url", ""),
  972. "audio_url": video_detail["audio_url"],
  973. "video_url": video_detail["video_url"],
  974. "session": f"xigua-search-{int(time.time())}",
  975. }
  976. return video_dict
  977. if __name__ == "__main__":
  978. user_list = [
  979. {
  980. "uid": 6267140,
  981. "source": "xigua",
  982. "link": "https://www.ixigua.com/home/2779177225827568",
  983. "nick_name": "秋晴爱音乐",
  984. "avatar_url": "",
  985. "mode": "author",
  986. },
  987. {
  988. "uid": 6267140,
  989. "source": "xigua",
  990. "link": "https://www.ixigua.com/home/2885546124776780",
  991. "nick_name": "朗诵放歌的老山羊",
  992. "avatar_url": "",
  993. "mode": "author",
  994. },
  995. {
  996. "uid": 6267140,
  997. "source": "xigua",
  998. "link": "https://www.ixigua.com/home/5880938217",
  999. "nick_name": "天原声疗",
  1000. "avatar_url": "",
  1001. "mode": "author",
  1002. },
  1003. ]
  1004. # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1005. # XGA = XiGuaAuthor(
  1006. # platform="xigua",
  1007. # mode="author",
  1008. # rule_dict=rule,
  1009. # env="prod",
  1010. # user_list=user_list
  1011. # )
  1012. # XGA.get_author_list()