xigua_author.py 44 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035
  1. import json
  2. import os
  3. import random
  4. import sys
  5. import string
  6. import time
  7. import uuid
  8. import base64
  9. import requests
  10. from fake_useragent import FakeUserAgent
  11. from common.mq import MQ
  12. sys.path.append(os.getcwd())
  13. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  14. def random_signature():
  15. """
  16. 随机生成签名
  17. """
  18. src_digits = string.digits # string_数字
  19. src_uppercase = string.ascii_uppercase # string_大写字母
  20. src_lowercase = string.ascii_lowercase # string_小写字母
  21. digits_num = random.randint(1, 6)
  22. uppercase_num = random.randint(1, 26 - digits_num - 1)
  23. lowercase_num = 26 - (digits_num + uppercase_num)
  24. password = (
  25. random.sample(src_digits, digits_num)
  26. + random.sample(src_uppercase, uppercase_num)
  27. + random.sample(src_lowercase, lowercase_num)
  28. )
  29. random.shuffle(password)
  30. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  31. new_password_start = new_password[0:18]
  32. new_password_end = new_password[-7:]
  33. if new_password[18] == "8":
  34. new_password = new_password_start + "w" + new_password_end
  35. elif new_password[18] == "9":
  36. new_password = new_password_start + "x" + new_password_end
  37. elif new_password[18] == "-":
  38. new_password = new_password_start + "y" + new_password_end
  39. elif new_password[18] == ".":
  40. new_password = new_password_start + "z" + new_password_end
  41. else:
  42. new_password = new_password_start + "y" + new_password_end
  43. return new_password
  44. def get_video_url(video_info):
  45. """
  46. 获取视频的链接
  47. """
  48. video_url_dict = {}
  49. # video_url
  50. if "videoResource" not in video_info:
  51. video_url_dict["video_url"] = ""
  52. video_url_dict["audio_url"] = ""
  53. video_url_dict["video_width"] = 0
  54. video_url_dict["video_height"] = 0
  55. elif "dash_120fps" in video_info["videoResource"]:
  56. if (
  57. "video_list" in video_info["videoResource"]["dash_120fps"]
  58. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  59. ):
  60. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  61. "video_4"
  62. ]["backup_url_1"]
  63. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  64. "video_4"
  65. ]["backup_url_1"]
  66. if len(video_url) % 3 == 1:
  67. video_url += "=="
  68. elif len(video_url) % 3 == 2:
  69. video_url += "="
  70. elif len(audio_url) % 3 == 1:
  71. audio_url += "=="
  72. elif len(audio_url) % 3 == 2:
  73. audio_url += "="
  74. video_url = base64.b64decode(video_url).decode("utf8")
  75. audio_url = base64.b64decode(audio_url).decode("utf8")
  76. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  77. "video_4"
  78. ]["vwidth"]
  79. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  80. "video_4"
  81. ]["vheight"]
  82. video_url_dict["video_url"] = video_url
  83. video_url_dict["audio_url"] = audio_url
  84. video_url_dict["video_width"] = video_width
  85. video_url_dict["video_height"] = video_height
  86. elif (
  87. "video_list" in video_info["videoResource"]["dash_120fps"]
  88. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  89. ):
  90. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  91. "video_3"
  92. ]["backup_url_1"]
  93. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  94. "video_3"
  95. ]["backup_url_1"]
  96. if len(video_url) % 3 == 1:
  97. video_url += "=="
  98. elif len(video_url) % 3 == 2:
  99. video_url += "="
  100. elif len(audio_url) % 3 == 1:
  101. audio_url += "=="
  102. elif len(audio_url) % 3 == 2:
  103. audio_url += "="
  104. video_url = base64.b64decode(video_url).decode("utf8")
  105. audio_url = base64.b64decode(audio_url).decode("utf8")
  106. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  107. "video_3"
  108. ]["vwidth"]
  109. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  110. "video_3"
  111. ]["vheight"]
  112. video_url_dict["video_url"] = video_url
  113. video_url_dict["audio_url"] = audio_url
  114. video_url_dict["video_width"] = video_width
  115. video_url_dict["video_height"] = video_height
  116. elif (
  117. "video_list" in video_info["videoResource"]["dash_120fps"]
  118. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  119. ):
  120. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  121. "video_2"
  122. ]["backup_url_1"]
  123. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  124. "video_2"
  125. ]["backup_url_1"]
  126. if len(video_url) % 3 == 1:
  127. video_url += "=="
  128. elif len(video_url) % 3 == 2:
  129. video_url += "="
  130. elif len(audio_url) % 3 == 1:
  131. audio_url += "=="
  132. elif len(audio_url) % 3 == 2:
  133. audio_url += "="
  134. video_url = base64.b64decode(video_url).decode("utf8")
  135. audio_url = base64.b64decode(audio_url).decode("utf8")
  136. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  137. "video_2"
  138. ]["vwidth"]
  139. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  140. "video_2"
  141. ]["vheight"]
  142. video_url_dict["video_url"] = video_url
  143. video_url_dict["audio_url"] = audio_url
  144. video_url_dict["video_width"] = video_width
  145. video_url_dict["video_height"] = video_height
  146. elif (
  147. "video_list" in video_info["videoResource"]["dash_120fps"]
  148. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  149. ):
  150. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  151. "video_1"
  152. ]["backup_url_1"]
  153. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  154. "video_1"
  155. ]["backup_url_1"]
  156. if len(video_url) % 3 == 1:
  157. video_url += "=="
  158. elif len(video_url) % 3 == 2:
  159. video_url += "="
  160. elif len(audio_url) % 3 == 1:
  161. audio_url += "=="
  162. elif len(audio_url) % 3 == 2:
  163. audio_url += "="
  164. video_url = base64.b64decode(video_url).decode("utf8")
  165. audio_url = base64.b64decode(audio_url).decode("utf8")
  166. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  167. "video_1"
  168. ]["vwidth"]
  169. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  170. "video_1"
  171. ]["vheight"]
  172. video_url_dict["video_url"] = video_url
  173. video_url_dict["audio_url"] = audio_url
  174. video_url_dict["video_width"] = video_width
  175. video_url_dict["video_height"] = video_height
  176. elif (
  177. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  178. and "dynamic_video_list"
  179. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  180. and "dynamic_audio_list"
  181. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  182. and len(
  183. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  184. "dynamic_video_list"
  185. ]
  186. )
  187. != 0
  188. and len(
  189. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  190. "dynamic_audio_list"
  191. ]
  192. )
  193. != 0
  194. ):
  195. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  196. "dynamic_video_list"
  197. ][-1]["backup_url_1"]
  198. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  199. "dynamic_audio_list"
  200. ][-1]["backup_url_1"]
  201. if len(video_url) % 3 == 1:
  202. video_url += "=="
  203. elif len(video_url) % 3 == 2:
  204. video_url += "="
  205. elif len(audio_url) % 3 == 1:
  206. audio_url += "=="
  207. elif len(audio_url) % 3 == 2:
  208. audio_url += "="
  209. video_url = base64.b64decode(video_url).decode("utf8")
  210. audio_url = base64.b64decode(audio_url).decode("utf8")
  211. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  212. "dynamic_video_list"
  213. ][-1]["vwidth"]
  214. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  215. "dynamic_video_list"
  216. ][-1]["vheight"]
  217. video_url_dict["video_url"] = video_url
  218. video_url_dict["audio_url"] = audio_url
  219. video_url_dict["video_width"] = video_width
  220. video_url_dict["video_height"] = video_height
  221. else:
  222. video_url_dict["video_url"] = ""
  223. video_url_dict["audio_url"] = ""
  224. video_url_dict["video_width"] = 0
  225. video_url_dict["video_height"] = 0
  226. elif "dash" in video_info["videoResource"]:
  227. if (
  228. "video_list" in video_info["videoResource"]["dash"]
  229. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  230. ):
  231. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  232. "backup_url_1"
  233. ]
  234. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  235. "backup_url_1"
  236. ]
  237. if len(video_url) % 3 == 1:
  238. video_url += "=="
  239. elif len(video_url) % 3 == 2:
  240. video_url += "="
  241. elif len(audio_url) % 3 == 1:
  242. audio_url += "=="
  243. elif len(audio_url) % 3 == 2:
  244. audio_url += "="
  245. video_url = base64.b64decode(video_url).decode("utf8")
  246. audio_url = base64.b64decode(audio_url).decode("utf8")
  247. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  248. "vwidth"
  249. ]
  250. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  251. "vheight"
  252. ]
  253. video_url_dict["video_url"] = video_url
  254. video_url_dict["audio_url"] = audio_url
  255. video_url_dict["video_width"] = video_width
  256. video_url_dict["video_height"] = video_height
  257. elif (
  258. "video_list" in video_info["videoResource"]["dash"]
  259. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  260. ):
  261. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  262. "backup_url_1"
  263. ]
  264. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  265. "backup_url_1"
  266. ]
  267. if len(video_url) % 3 == 1:
  268. video_url += "=="
  269. elif len(video_url) % 3 == 2:
  270. video_url += "="
  271. elif len(audio_url) % 3 == 1:
  272. audio_url += "=="
  273. elif len(audio_url) % 3 == 2:
  274. audio_url += "="
  275. video_url = base64.b64decode(video_url).decode("utf8")
  276. audio_url = base64.b64decode(audio_url).decode("utf8")
  277. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  278. "vwidth"
  279. ]
  280. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  281. "vheight"
  282. ]
  283. video_url_dict["video_url"] = video_url
  284. video_url_dict["audio_url"] = audio_url
  285. video_url_dict["video_width"] = video_width
  286. video_url_dict["video_height"] = video_height
  287. elif (
  288. "video_list" in video_info["videoResource"]["dash"]
  289. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  290. ):
  291. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  292. "backup_url_1"
  293. ]
  294. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  295. "backup_url_1"
  296. ]
  297. if len(video_url) % 3 == 1:
  298. video_url += "=="
  299. elif len(video_url) % 3 == 2:
  300. video_url += "="
  301. elif len(audio_url) % 3 == 1:
  302. audio_url += "=="
  303. elif len(audio_url) % 3 == 2:
  304. audio_url += "="
  305. video_url = base64.b64decode(video_url).decode("utf8")
  306. audio_url = base64.b64decode(audio_url).decode("utf8")
  307. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  308. "vwidth"
  309. ]
  310. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  311. "vheight"
  312. ]
  313. video_url_dict["video_url"] = video_url
  314. video_url_dict["audio_url"] = audio_url
  315. video_url_dict["video_width"] = video_width
  316. video_url_dict["video_height"] = video_height
  317. elif (
  318. "video_list" in video_info["videoResource"]["dash"]
  319. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  320. ):
  321. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  322. "backup_url_1"
  323. ]
  324. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  325. "backup_url_1"
  326. ]
  327. if len(video_url) % 3 == 1:
  328. video_url += "=="
  329. elif len(video_url) % 3 == 2:
  330. video_url += "="
  331. elif len(audio_url) % 3 == 1:
  332. audio_url += "=="
  333. elif len(audio_url) % 3 == 2:
  334. audio_url += "="
  335. video_url = base64.b64decode(video_url).decode("utf8")
  336. audio_url = base64.b64decode(audio_url).decode("utf8")
  337. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  338. "vwidth"
  339. ]
  340. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  341. "vheight"
  342. ]
  343. video_url_dict["video_url"] = video_url
  344. video_url_dict["audio_url"] = audio_url
  345. video_url_dict["video_width"] = video_width
  346. video_url_dict["video_height"] = video_height
  347. elif (
  348. "dynamic_video" in video_info["videoResource"]["dash"]
  349. and "dynamic_video_list"
  350. in video_info["videoResource"]["dash"]["dynamic_video"]
  351. and "dynamic_audio_list"
  352. in video_info["videoResource"]["dash"]["dynamic_video"]
  353. and len(
  354. video_info["videoResource"]["dash"]["dynamic_video"][
  355. "dynamic_video_list"
  356. ]
  357. )
  358. != 0
  359. and len(
  360. video_info["videoResource"]["dash"]["dynamic_video"][
  361. "dynamic_audio_list"
  362. ]
  363. )
  364. != 0
  365. ):
  366. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  367. "dynamic_video_list"
  368. ][-1]["backup_url_1"]
  369. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  370. "dynamic_audio_list"
  371. ][-1]["backup_url_1"]
  372. if len(video_url) % 3 == 1:
  373. video_url += "=="
  374. elif len(video_url) % 3 == 2:
  375. video_url += "="
  376. elif len(audio_url) % 3 == 1:
  377. audio_url += "=="
  378. elif len(audio_url) % 3 == 2:
  379. audio_url += "="
  380. video_url = base64.b64decode(video_url).decode("utf8")
  381. audio_url = base64.b64decode(audio_url).decode("utf8")
  382. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  383. "dynamic_video_list"
  384. ][-1]["vwidth"]
  385. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  386. "dynamic_video_list"
  387. ][-1]["vheight"]
  388. video_url_dict["video_url"] = video_url
  389. video_url_dict["audio_url"] = audio_url
  390. video_url_dict["video_width"] = video_width
  391. video_url_dict["video_height"] = video_height
  392. else:
  393. video_url_dict["video_url"] = ""
  394. video_url_dict["audio_url"] = ""
  395. video_url_dict["video_width"] = 0
  396. video_url_dict["video_height"] = 0
  397. elif "normal" in video_info["videoResource"]:
  398. if (
  399. "video_list" in video_info["videoResource"]["normal"]
  400. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  401. ):
  402. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  403. "backup_url_1"
  404. ]
  405. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  406. "backup_url_1"
  407. ]
  408. if len(video_url) % 3 == 1:
  409. video_url += "=="
  410. elif len(video_url) % 3 == 2:
  411. video_url += "="
  412. elif len(audio_url) % 3 == 1:
  413. audio_url += "=="
  414. elif len(audio_url) % 3 == 2:
  415. audio_url += "="
  416. video_url = base64.b64decode(video_url).decode("utf8")
  417. audio_url = base64.b64decode(audio_url).decode("utf8")
  418. video_width = video_info["videoResource"]["normal"]["video_list"][
  419. "video_4"
  420. ]["vwidth"]
  421. video_height = video_info["videoResource"]["normal"]["video_list"][
  422. "video_4"
  423. ]["vheight"]
  424. video_url_dict["video_url"] = video_url
  425. video_url_dict["audio_url"] = audio_url
  426. video_url_dict["video_width"] = video_width
  427. video_url_dict["video_height"] = video_height
  428. elif (
  429. "video_list" in video_info["videoResource"]["normal"]
  430. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  431. ):
  432. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  433. "backup_url_1"
  434. ]
  435. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  436. "backup_url_1"
  437. ]
  438. if len(video_url) % 3 == 1:
  439. video_url += "=="
  440. elif len(video_url) % 3 == 2:
  441. video_url += "="
  442. elif len(audio_url) % 3 == 1:
  443. audio_url += "=="
  444. elif len(audio_url) % 3 == 2:
  445. audio_url += "="
  446. video_url = base64.b64decode(video_url).decode("utf8")
  447. audio_url = base64.b64decode(audio_url).decode("utf8")
  448. video_width = video_info["videoResource"]["normal"]["video_list"][
  449. "video_3"
  450. ]["vwidth"]
  451. video_height = video_info["videoResource"]["normal"]["video_list"][
  452. "video_3"
  453. ]["vheight"]
  454. video_url_dict["video_url"] = video_url
  455. video_url_dict["audio_url"] = audio_url
  456. video_url_dict["video_width"] = video_width
  457. video_url_dict["video_height"] = video_height
  458. elif (
  459. "video_list" in video_info["videoResource"]["normal"]
  460. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  461. ):
  462. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  463. "backup_url_1"
  464. ]
  465. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  466. "backup_url_1"
  467. ]
  468. if len(video_url) % 3 == 1:
  469. video_url += "=="
  470. elif len(video_url) % 3 == 2:
  471. video_url += "="
  472. elif len(audio_url) % 3 == 1:
  473. audio_url += "=="
  474. elif len(audio_url) % 3 == 2:
  475. audio_url += "="
  476. video_url = base64.b64decode(video_url).decode("utf8")
  477. audio_url = base64.b64decode(audio_url).decode("utf8")
  478. video_width = video_info["videoResource"]["normal"]["video_list"][
  479. "video_2"
  480. ]["vwidth"]
  481. video_height = video_info["videoResource"]["normal"]["video_list"][
  482. "video_2"
  483. ]["vheight"]
  484. video_url_dict["video_url"] = video_url
  485. video_url_dict["audio_url"] = audio_url
  486. video_url_dict["video_width"] = video_width
  487. video_url_dict["video_height"] = video_height
  488. elif (
  489. "video_list" in video_info["videoResource"]["normal"]
  490. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  491. ):
  492. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  493. "backup_url_1"
  494. ]
  495. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  496. "backup_url_1"
  497. ]
  498. if len(video_url) % 3 == 1:
  499. video_url += "=="
  500. elif len(video_url) % 3 == 2:
  501. video_url += "="
  502. elif len(audio_url) % 3 == 1:
  503. audio_url += "=="
  504. elif len(audio_url) % 3 == 2:
  505. audio_url += "="
  506. video_url = base64.b64decode(video_url).decode("utf8")
  507. audio_url = base64.b64decode(audio_url).decode("utf8")
  508. video_width = video_info["videoResource"]["normal"]["video_list"][
  509. "video_1"
  510. ]["vwidth"]
  511. video_height = video_info["videoResource"]["normal"]["video_list"][
  512. "video_1"
  513. ]["vheight"]
  514. video_url_dict["video_url"] = video_url
  515. video_url_dict["audio_url"] = audio_url
  516. video_url_dict["video_width"] = video_width
  517. video_url_dict["video_height"] = video_height
  518. elif (
  519. "dynamic_video" in video_info["videoResource"]["normal"]
  520. and "dynamic_video_list"
  521. in video_info["videoResource"]["normal"]["dynamic_video"]
  522. and "dynamic_audio_list"
  523. in video_info["videoResource"]["normal"]["dynamic_video"]
  524. and len(
  525. video_info["videoResource"]["normal"]["dynamic_video"][
  526. "dynamic_video_list"
  527. ]
  528. )
  529. != 0
  530. and len(
  531. video_info["videoResource"]["normal"]["dynamic_video"][
  532. "dynamic_audio_list"
  533. ]
  534. )
  535. != 0
  536. ):
  537. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  538. "dynamic_video_list"
  539. ][-1]["backup_url_1"]
  540. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  541. "dynamic_audio_list"
  542. ][-1]["backup_url_1"]
  543. if len(video_url) % 3 == 1:
  544. video_url += "=="
  545. elif len(video_url) % 3 == 2:
  546. video_url += "="
  547. elif len(audio_url) % 3 == 1:
  548. audio_url += "=="
  549. elif len(audio_url) % 3 == 2:
  550. audio_url += "="
  551. video_url = base64.b64decode(video_url).decode("utf8")
  552. audio_url = base64.b64decode(audio_url).decode("utf8")
  553. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  554. "dynamic_video_list"
  555. ][-1]["vwidth"]
  556. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  557. "dynamic_video_list"
  558. ][-1]["vheight"]
  559. video_url_dict["video_url"] = video_url
  560. video_url_dict["audio_url"] = audio_url
  561. video_url_dict["video_width"] = video_width
  562. video_url_dict["video_height"] = video_height
  563. else:
  564. video_url_dict["video_url"] = ""
  565. video_url_dict["audio_url"] = ""
  566. video_url_dict["video_width"] = 0
  567. video_url_dict["video_height"] = 0
  568. else:
  569. video_url_dict["video_url"] = ""
  570. video_url_dict["audio_url"] = ""
  571. video_url_dict["video_width"] = 0
  572. video_url_dict["video_height"] = 0
  573. return video_url_dict
  574. def get_comment_cnt(item_id):
  575. """
  576. 获取视频的评论数量
  577. """
  578. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  579. params = {
  580. "tab_index": "0",
  581. "count": "10",
  582. "offset": "10",
  583. "group_id": str(item_id),
  584. "item_id": str(item_id),
  585. "aid": "1768",
  586. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  587. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  588. "_signature": random_signature(),
  589. }
  590. headers = {
  591. "authority": "www.ixigua.com",
  592. "accept": "application/json, text/plain, */*",
  593. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  594. "cache-control": "no-cache",
  595. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  596. "pragma": "no-cache",
  597. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  598. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  599. "sec-ch-ua-mobile": "?0",
  600. "sec-ch-ua-platform": '"macOS"',
  601. "sec-fetch-dest": "empty",
  602. "sec-fetch-mode": "cors",
  603. "sec-fetch-site": "same-origin",
  604. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  605. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  606. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  607. }
  608. response = requests.get(
  609. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  610. )
  611. response.close()
  612. if (
  613. response.status_code != 200
  614. or "total_number" not in response.json()
  615. or response.json() == {}
  616. ):
  617. return 0
  618. return response.json().get("total_number", 0)
  619. class XiGuaAuthor:
  620. """
  621. 西瓜账号爬虫
  622. """
  623. def __init__(self, platform, mode, rule_dict, env, user_list):
  624. self.platform = platform
  625. self.mode = mode
  626. self.rule_dict = rule_dict
  627. self.env = env
  628. self.user_list = user_list
  629. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  630. self.download_count = 0
  631. def rule_maker(self, account):
  632. """
  633. 通过不同的账号生成不同的规则
  634. :param account: 输入的账号信息
  635. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  636. """
  637. flag = account['link'].split("_")[0]
  638. if flag == "V1":
  639. rule_dict = {
  640. "play_cnt": {"min": 100000, "max": 0},
  641. 'period': {"min": 90, "max": 90},
  642. 'special': 0.02
  643. }
  644. return rule_dict
  645. elif flag == "V2":
  646. rule_dict = {
  647. "play_cnt": {"min": 10000, "max": 0},
  648. 'period': {"min": 90, "max": 90},
  649. 'special': 0.01
  650. }
  651. return rule_dict
  652. elif flag == "V3":
  653. rule_dict = {
  654. "play_cnt": {"min": 5000, "max": 0},
  655. 'period': {"min": 90, "max": 90},
  656. 'special': 0.01
  657. }
  658. return rule_dict
  659. else:
  660. return self.rule_dict
  661. def get_author_list(self):
  662. """
  663. 每轮只抓取定量的数据,到达数量后自己退出
  664. 获取账号列表以及账号信息
  665. """
  666. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  667. for user_dict in self.user_list:
  668. # if self.download_count <= max_count:
  669. try:
  670. self.get_video_list(user_dict)
  671. except Exception as e:
  672. AliyunLogger.logging(
  673. code="3001",
  674. account=user_dict["uid"],
  675. platform=self.platform,
  676. mode=self.mode,
  677. env=self.env,
  678. message="扫描账号时出现bug, 报错是 {}".format(e)
  679. )
  680. # time.sleep(random.randint(1, 15))
  681. # else:
  682. # AliyunLogger.logging(
  683. # code="2000",
  684. # platform=self.platform,
  685. # mode=self.mode,
  686. # env=self.env,
  687. # message="本轮已经抓取足够数量的视频,已经自动退出",
  688. # )
  689. # return
  690. def get_video_list(self, user_dict):
  691. """
  692. 获取某个账号的视频列表
  693. """
  694. offset = 0
  695. signature = random_signature()
  696. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  697. while True:
  698. if user_dict['link'][0] == "V":
  699. link = user_dict["link"][3:]
  700. else:
  701. link = user_dict["link"]
  702. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  703. params = {
  704. "to_user_id": to_user_id,
  705. "offset": str(offset),
  706. "limit": "30",
  707. "maxBehotTime": "0",
  708. "order": "new",
  709. "isHome": "0",
  710. # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  711. # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  712. "_signature": signature,
  713. }
  714. headers = {
  715. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  716. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  717. }
  718. response = requests.get(
  719. url=url,
  720. headers=headers,
  721. params=params,
  722. proxies=tunnel_proxies(),
  723. timeout=5,
  724. )
  725. offset += 30
  726. if "data" not in response.text or response.status_code != 200:
  727. AliyunLogger.logging(
  728. code="2000",
  729. platform=self.platform,
  730. mode=self.mode,
  731. env=self.env,
  732. message=f"get_videoList:{response.text}\n",
  733. )
  734. return
  735. elif not response.json()["data"]["videoList"]:
  736. AliyunLogger.logging(
  737. code="2000",
  738. platform=self.platform,
  739. mode=self.mode,
  740. env=self.env,
  741. message=f"没有更多数据啦~\n",
  742. )
  743. return
  744. else:
  745. feeds = response.json()["data"]["videoList"]
  746. for video_obj in feeds:
  747. try:
  748. AliyunLogger.logging(
  749. code="1001",
  750. account=user_dict['uid'],
  751. platform=self.platform,
  752. mode=self.mode,
  753. env=self.env,
  754. data=video_obj,
  755. message="扫描到一条视频",
  756. )
  757. date_flag = self.process_video_obj(video_obj, user_dict)
  758. if not date_flag:
  759. return
  760. except Exception as e:
  761. AliyunLogger.logging(
  762. code="3000",
  763. platform=self.platform,
  764. mode=self.mode,
  765. env=self.env,
  766. data=video_obj,
  767. message="抓取单条视频异常, 报错原因是: {}".format(e),
  768. )
  769. def process_video_obj(self, video_obj, user_dict):
  770. new_rule = self.rule_maker(user_dict)
  771. trace_id = self.platform + str(uuid.uuid1())
  772. item_id = video_obj.get("item_id", "")
  773. if not item_id:
  774. AliyunLogger.logging(
  775. code="2005",
  776. account=user_dict['uid'],
  777. platform=self.platform,
  778. mode=self.mode,
  779. env=self.env,
  780. message="无效视频",
  781. data=video_obj,
  782. trace_id=trace_id,
  783. )
  784. return
  785. # 获取视频信息
  786. video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
  787. video_dict["out_user_id"] = video_dict["user_id"]
  788. video_dict["platform"] = self.platform
  789. video_dict["strategy"] = self.mode
  790. video_dict["out_video_id"] = video_dict["video_id"]
  791. video_dict["width"] = video_dict["video_width"]
  792. video_dict["height"] = video_dict["video_height"]
  793. video_dict["crawler_rule"] = json.dumps(new_rule)
  794. video_dict["user_id"] = user_dict["uid"]
  795. video_dict["publish_time"] = video_dict["publish_time_str"]
  796. video_dict["strategy_type"] = self.mode
  797. video_dict["update_time_stamp"] = int(time.time())
  798. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  799. new_rule.get("period", {}).get("max", 1000)):
  800. if not video_obj['is_top']:
  801. """
  802. 非置顶数据发布时间超过才退出
  803. """
  804. AliyunLogger.logging(
  805. code="2004",
  806. account=user_dict['uid'],
  807. platform=self.platform,
  808. mode=self.mode,
  809. env=self.env,
  810. data=video_dict,
  811. message="发布时间超过{}天".format(
  812. int(new_rule.get("period", {}).get("max", 1000))
  813. ),
  814. )
  815. return False
  816. pipeline = PiaoQuanPipeline(
  817. platform=self.platform,
  818. mode=self.mode,
  819. rule_dict=new_rule,
  820. env=self.env,
  821. item=video_dict,
  822. trace_id=trace_id,
  823. )
  824. title_flag = pipeline.title_flag()
  825. repeat_flag = pipeline.repeat_video()
  826. if title_flag and repeat_flag:
  827. if new_rule.get("special"):
  828. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  829. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  830. self.mq.send_msg(video_dict)
  831. self.download_count += 1
  832. AliyunLogger.logging(
  833. code="1002",
  834. account=user_dict['uid'],
  835. platform=self.platform,
  836. mode=self.mode,
  837. env=self.env,
  838. data=video_dict,
  839. trace_id=trace_id,
  840. message="成功发送 MQ 至 ETL",
  841. )
  842. return True
  843. else:
  844. AliyunLogger.logging(
  845. code="2008",
  846. account=user_dict['uid'],
  847. platform=self.platform,
  848. mode=self.mode,
  849. env=self.env,
  850. message="不满足特殊规则, 点赞量/播放量",
  851. data=video_dict
  852. )
  853. else:
  854. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  855. self.mq.send_msg(video_dict)
  856. self.download_count += 1
  857. AliyunLogger.logging(
  858. code="1002",
  859. account=user_dict['uid'],
  860. platform=self.platform,
  861. mode=self.mode,
  862. env=self.env,
  863. data=video_dict,
  864. trace_id=trace_id,
  865. message="成功发送 MQ 至 ETL",
  866. )
  867. return True
  868. else:
  869. AliyunLogger.logging(
  870. code="2008",
  871. account=user_dict['uid'],
  872. platform=self.platform,
  873. mode=self.mode,
  874. env=self.env,
  875. message="不满足特殊规则, 播放量",
  876. data=video_dict
  877. )
  878. return True
  879. def get_video_info(self, item_id, trace_id):
  880. url = "https://www.ixigua.com/api/mixVideo/information?"
  881. headers = {
  882. "accept-encoding": "gzip, deflate",
  883. "accept-language": "zh-CN,zh-Hans;q=0.9",
  884. "user-agent": FakeUserAgent().random,
  885. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  886. }
  887. params = {
  888. "mixId": str(item_id),
  889. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
  890. "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  891. "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
  892. "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
  893. "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
  894. }
  895. cookies = {
  896. "ixigua-a-s": "1",
  897. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
  898. "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  899. "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
  900. "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
  901. "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
  902. "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
  903. "__ac_nonce": "06304878000964fdad287",
  904. "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
  905. "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
  906. "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
  907. "_tea_utm_cache_1300": "undefined",
  908. "support_avif": "false",
  909. "support_webp": "false",
  910. "xiguavideopcwebid": "7134967546256016900",
  911. "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
  912. }
  913. response = requests.get(
  914. url=url,
  915. headers=headers,
  916. params=params,
  917. cookies=cookies,
  918. proxies=tunnel_proxies(),
  919. timeout=5,
  920. )
  921. if (
  922. response.status_code != 200
  923. or "data" not in response.json()
  924. or response.json()["data"] == {}
  925. ):
  926. AliyunLogger.logging(
  927. code="2000",
  928. platform=self.platform,
  929. mode=self.mode,
  930. env=self.env,
  931. message="获取视频信息失败",
  932. trace_id=trace_id,
  933. )
  934. return None
  935. else:
  936. video_info = (
  937. response.json()["data"]
  938. .get("gidInformation", {})
  939. .get("packerData", {})
  940. .get("video", {})
  941. )
  942. if video_info == {}:
  943. return None
  944. video_detail = get_video_url(video_info)
  945. video_dict = {
  946. "video_title": video_info.get("title", ""),
  947. "video_id": video_info.get("videoResource", {}).get("vid", ""),
  948. "gid": str(item_id),
  949. "play_cnt": int(video_info.get("video_watch_count", 0)),
  950. "like_cnt": int(video_info.get("video_like_count", 0)),
  951. "comment_cnt": int(get_comment_cnt(item_id)),
  952. "share_cnt": 0,
  953. "favorite_cnt": 0,
  954. "duration": int(video_info.get("video_duration", 0)),
  955. "video_width": int(video_detail["video_width"]),
  956. "video_height": int(video_detail["video_height"]),
  957. "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
  958. "publish_time_str": time.strftime(
  959. "%Y-%m-%d %H:%M:%S",
  960. time.localtime(int(video_info.get("video_publish_time", 0))),
  961. ),
  962. "user_name": video_info.get("user_info", {}).get("name", ""),
  963. "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
  964. "avatar_url": str(
  965. video_info.get("user_info", {}).get("avatar_url", "")
  966. ),
  967. "cover_url": video_info.get("poster_url", ""),
  968. "audio_url": video_detail["audio_url"],
  969. "video_url": video_detail["video_url"],
  970. "session": f"xigua-search-{int(time.time())}",
  971. }
  972. return video_dict
  973. if __name__ == "__main__":
  974. user_list = [
  975. {
  976. "uid": 6267140,
  977. "source": "xigua",
  978. "link": "https://www.ixigua.com/home/2779177225827568",
  979. "nick_name": "秋晴爱音乐",
  980. "avatar_url": "",
  981. "mode": "author",
  982. },
  983. {
  984. "uid": 6267140,
  985. "source": "xigua",
  986. "link": "https://www.ixigua.com/home/2885546124776780",
  987. "nick_name": "朗诵放歌的老山羊",
  988. "avatar_url": "",
  989. "mode": "author",
  990. },
  991. {
  992. "uid": 6267140,
  993. "source": "xigua",
  994. "link": "https://www.ixigua.com/home/5880938217",
  995. "nick_name": "天原声疗",
  996. "avatar_url": "",
  997. "mode": "author",
  998. },
  999. ]
  1000. # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1001. # XGA = XiGuaAuthor(
  1002. # platform="xigua",
  1003. # mode="author",
  1004. # rule_dict=rule,
  1005. # env="prod",
  1006. # user_list=user_list
  1007. # )
  1008. # XGA.get_author_list()