xigua_author.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. import json
  2. import os
  3. import random
  4. import sys
  5. import string
  6. import time
  7. import uuid
  8. import base64
  9. import requests
  10. from fake_useragent import FakeUserAgent
  11. from common.mq import MQ
  12. sys.path.append(os.getcwd())
  13. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  14. def random_signature():
  15. """
  16. 随机生成签名
  17. """
  18. src_digits = string.digits # string_数字
  19. src_uppercase = string.ascii_uppercase # string_大写字母
  20. src_lowercase = string.ascii_lowercase # string_小写字母
  21. digits_num = random.randint(1, 6)
  22. uppercase_num = random.randint(1, 26 - digits_num - 1)
  23. lowercase_num = 26 - (digits_num + uppercase_num)
  24. password = (
  25. random.sample(src_digits, digits_num)
  26. + random.sample(src_uppercase, uppercase_num)
  27. + random.sample(src_lowercase, lowercase_num)
  28. )
  29. random.shuffle(password)
  30. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  31. new_password_start = new_password[0:18]
  32. new_password_end = new_password[-7:]
  33. if new_password[18] == "8":
  34. new_password = new_password_start + "w" + new_password_end
  35. elif new_password[18] == "9":
  36. new_password = new_password_start + "x" + new_password_end
  37. elif new_password[18] == "-":
  38. new_password = new_password_start + "y" + new_password_end
  39. elif new_password[18] == ".":
  40. new_password = new_password_start + "z" + new_password_end
  41. else:
  42. new_password = new_password_start + "y" + new_password_end
  43. return new_password
  44. def get_video_url(video_info):
  45. """
  46. 获取视频的链接
  47. """
  48. video_url_dict = {}
  49. # video_url
  50. if "videoResource" not in video_info:
  51. video_url_dict["video_url"] = ""
  52. video_url_dict["audio_url"] = ""
  53. video_url_dict["video_width"] = 0
  54. video_url_dict["video_height"] = 0
  55. elif "dash_120fps" in video_info["videoResource"]:
  56. if (
  57. "video_list" in video_info["videoResource"]["dash_120fps"]
  58. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  59. ):
  60. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  61. "video_4"
  62. ]["backup_url_1"]
  63. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  64. "video_4"
  65. ]["backup_url_1"]
  66. if len(video_url) % 3 == 1:
  67. video_url += "=="
  68. elif len(video_url) % 3 == 2:
  69. video_url += "="
  70. elif len(audio_url) % 3 == 1:
  71. audio_url += "=="
  72. elif len(audio_url) % 3 == 2:
  73. audio_url += "="
  74. video_url = base64.b64decode(video_url).decode("utf8")
  75. audio_url = base64.b64decode(audio_url).decode("utf8")
  76. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  77. "video_4"
  78. ]["vwidth"]
  79. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  80. "video_4"
  81. ]["vheight"]
  82. video_url_dict["video_url"] = video_url
  83. video_url_dict["audio_url"] = audio_url
  84. video_url_dict["video_width"] = video_width
  85. video_url_dict["video_height"] = video_height
  86. elif (
  87. "video_list" in video_info["videoResource"]["dash_120fps"]
  88. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  89. ):
  90. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  91. "video_3"
  92. ]["backup_url_1"]
  93. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  94. "video_3"
  95. ]["backup_url_1"]
  96. if len(video_url) % 3 == 1:
  97. video_url += "=="
  98. elif len(video_url) % 3 == 2:
  99. video_url += "="
  100. elif len(audio_url) % 3 == 1:
  101. audio_url += "=="
  102. elif len(audio_url) % 3 == 2:
  103. audio_url += "="
  104. video_url = base64.b64decode(video_url).decode("utf8")
  105. audio_url = base64.b64decode(audio_url).decode("utf8")
  106. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  107. "video_3"
  108. ]["vwidth"]
  109. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  110. "video_3"
  111. ]["vheight"]
  112. video_url_dict["video_url"] = video_url
  113. video_url_dict["audio_url"] = audio_url
  114. video_url_dict["video_width"] = video_width
  115. video_url_dict["video_height"] = video_height
  116. elif (
  117. "video_list" in video_info["videoResource"]["dash_120fps"]
  118. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  119. ):
  120. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  121. "video_2"
  122. ]["backup_url_1"]
  123. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  124. "video_2"
  125. ]["backup_url_1"]
  126. if len(video_url) % 3 == 1:
  127. video_url += "=="
  128. elif len(video_url) % 3 == 2:
  129. video_url += "="
  130. elif len(audio_url) % 3 == 1:
  131. audio_url += "=="
  132. elif len(audio_url) % 3 == 2:
  133. audio_url += "="
  134. video_url = base64.b64decode(video_url).decode("utf8")
  135. audio_url = base64.b64decode(audio_url).decode("utf8")
  136. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  137. "video_2"
  138. ]["vwidth"]
  139. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  140. "video_2"
  141. ]["vheight"]
  142. video_url_dict["video_url"] = video_url
  143. video_url_dict["audio_url"] = audio_url
  144. video_url_dict["video_width"] = video_width
  145. video_url_dict["video_height"] = video_height
  146. elif (
  147. "video_list" in video_info["videoResource"]["dash_120fps"]
  148. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  149. ):
  150. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  151. "video_1"
  152. ]["backup_url_1"]
  153. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  154. "video_1"
  155. ]["backup_url_1"]
  156. if len(video_url) % 3 == 1:
  157. video_url += "=="
  158. elif len(video_url) % 3 == 2:
  159. video_url += "="
  160. elif len(audio_url) % 3 == 1:
  161. audio_url += "=="
  162. elif len(audio_url) % 3 == 2:
  163. audio_url += "="
  164. video_url = base64.b64decode(video_url).decode("utf8")
  165. audio_url = base64.b64decode(audio_url).decode("utf8")
  166. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  167. "video_1"
  168. ]["vwidth"]
  169. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  170. "video_1"
  171. ]["vheight"]
  172. video_url_dict["video_url"] = video_url
  173. video_url_dict["audio_url"] = audio_url
  174. video_url_dict["video_width"] = video_width
  175. video_url_dict["video_height"] = video_height
  176. elif (
  177. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  178. and "dynamic_video_list"
  179. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  180. and "dynamic_audio_list"
  181. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  182. and len(
  183. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  184. "dynamic_video_list"
  185. ]
  186. )
  187. != 0
  188. and len(
  189. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  190. "dynamic_audio_list"
  191. ]
  192. )
  193. != 0
  194. ):
  195. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  196. "dynamic_video_list"
  197. ][-1]["backup_url_1"]
  198. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  199. "dynamic_audio_list"
  200. ][-1]["backup_url_1"]
  201. if len(video_url) % 3 == 1:
  202. video_url += "=="
  203. elif len(video_url) % 3 == 2:
  204. video_url += "="
  205. elif len(audio_url) % 3 == 1:
  206. audio_url += "=="
  207. elif len(audio_url) % 3 == 2:
  208. audio_url += "="
  209. video_url = base64.b64decode(video_url).decode("utf8")
  210. audio_url = base64.b64decode(audio_url).decode("utf8")
  211. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  212. "dynamic_video_list"
  213. ][-1]["vwidth"]
  214. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  215. "dynamic_video_list"
  216. ][-1]["vheight"]
  217. video_url_dict["video_url"] = video_url
  218. video_url_dict["audio_url"] = audio_url
  219. video_url_dict["video_width"] = video_width
  220. video_url_dict["video_height"] = video_height
  221. else:
  222. video_url_dict["video_url"] = ""
  223. video_url_dict["audio_url"] = ""
  224. video_url_dict["video_width"] = 0
  225. video_url_dict["video_height"] = 0
  226. elif "dash" in video_info["videoResource"]:
  227. if (
  228. "video_list" in video_info["videoResource"]["dash"]
  229. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  230. ):
  231. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  232. "backup_url_1"
  233. ]
  234. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  235. "backup_url_1"
  236. ]
  237. if len(video_url) % 3 == 1:
  238. video_url += "=="
  239. elif len(video_url) % 3 == 2:
  240. video_url += "="
  241. elif len(audio_url) % 3 == 1:
  242. audio_url += "=="
  243. elif len(audio_url) % 3 == 2:
  244. audio_url += "="
  245. video_url = base64.b64decode(video_url).decode("utf8")
  246. audio_url = base64.b64decode(audio_url).decode("utf8")
  247. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  248. "vwidth"
  249. ]
  250. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  251. "vheight"
  252. ]
  253. video_url_dict["video_url"] = video_url
  254. video_url_dict["audio_url"] = audio_url
  255. video_url_dict["video_width"] = video_width
  256. video_url_dict["video_height"] = video_height
  257. elif (
  258. "video_list" in video_info["videoResource"]["dash"]
  259. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  260. ):
  261. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  262. "backup_url_1"
  263. ]
  264. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  265. "backup_url_1"
  266. ]
  267. if len(video_url) % 3 == 1:
  268. video_url += "=="
  269. elif len(video_url) % 3 == 2:
  270. video_url += "="
  271. elif len(audio_url) % 3 == 1:
  272. audio_url += "=="
  273. elif len(audio_url) % 3 == 2:
  274. audio_url += "="
  275. video_url = base64.b64decode(video_url).decode("utf8")
  276. audio_url = base64.b64decode(audio_url).decode("utf8")
  277. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  278. "vwidth"
  279. ]
  280. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  281. "vheight"
  282. ]
  283. video_url_dict["video_url"] = video_url
  284. video_url_dict["audio_url"] = audio_url
  285. video_url_dict["video_width"] = video_width
  286. video_url_dict["video_height"] = video_height
  287. elif (
  288. "video_list" in video_info["videoResource"]["dash"]
  289. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  290. ):
  291. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  292. "backup_url_1"
  293. ]
  294. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  295. "backup_url_1"
  296. ]
  297. if len(video_url) % 3 == 1:
  298. video_url += "=="
  299. elif len(video_url) % 3 == 2:
  300. video_url += "="
  301. elif len(audio_url) % 3 == 1:
  302. audio_url += "=="
  303. elif len(audio_url) % 3 == 2:
  304. audio_url += "="
  305. video_url = base64.b64decode(video_url).decode("utf8")
  306. audio_url = base64.b64decode(audio_url).decode("utf8")
  307. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  308. "vwidth"
  309. ]
  310. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  311. "vheight"
  312. ]
  313. video_url_dict["video_url"] = video_url
  314. video_url_dict["audio_url"] = audio_url
  315. video_url_dict["video_width"] = video_width
  316. video_url_dict["video_height"] = video_height
  317. elif (
  318. "video_list" in video_info["videoResource"]["dash"]
  319. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  320. ):
  321. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  322. "backup_url_1"
  323. ]
  324. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  325. "backup_url_1"
  326. ]
  327. if len(video_url) % 3 == 1:
  328. video_url += "=="
  329. elif len(video_url) % 3 == 2:
  330. video_url += "="
  331. elif len(audio_url) % 3 == 1:
  332. audio_url += "=="
  333. elif len(audio_url) % 3 == 2:
  334. audio_url += "="
  335. video_url = base64.b64decode(video_url).decode("utf8")
  336. audio_url = base64.b64decode(audio_url).decode("utf8")
  337. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  338. "vwidth"
  339. ]
  340. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  341. "vheight"
  342. ]
  343. video_url_dict["video_url"] = video_url
  344. video_url_dict["audio_url"] = audio_url
  345. video_url_dict["video_width"] = video_width
  346. video_url_dict["video_height"] = video_height
  347. elif (
  348. "dynamic_video" in video_info["videoResource"]["dash"]
  349. and "dynamic_video_list"
  350. in video_info["videoResource"]["dash"]["dynamic_video"]
  351. and "dynamic_audio_list"
  352. in video_info["videoResource"]["dash"]["dynamic_video"]
  353. and len(
  354. video_info["videoResource"]["dash"]["dynamic_video"][
  355. "dynamic_video_list"
  356. ]
  357. )
  358. != 0
  359. and len(
  360. video_info["videoResource"]["dash"]["dynamic_video"][
  361. "dynamic_audio_list"
  362. ]
  363. )
  364. != 0
  365. ):
  366. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  367. "dynamic_video_list"
  368. ][-1]["backup_url_1"]
  369. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  370. "dynamic_audio_list"
  371. ][-1]["backup_url_1"]
  372. if len(video_url) % 3 == 1:
  373. video_url += "=="
  374. elif len(video_url) % 3 == 2:
  375. video_url += "="
  376. elif len(audio_url) % 3 == 1:
  377. audio_url += "=="
  378. elif len(audio_url) % 3 == 2:
  379. audio_url += "="
  380. video_url = base64.b64decode(video_url).decode("utf8")
  381. audio_url = base64.b64decode(audio_url).decode("utf8")
  382. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  383. "dynamic_video_list"
  384. ][-1]["vwidth"]
  385. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  386. "dynamic_video_list"
  387. ][-1]["vheight"]
  388. video_url_dict["video_url"] = video_url
  389. video_url_dict["audio_url"] = audio_url
  390. video_url_dict["video_width"] = video_width
  391. video_url_dict["video_height"] = video_height
  392. else:
  393. video_url_dict["video_url"] = ""
  394. video_url_dict["audio_url"] = ""
  395. video_url_dict["video_width"] = 0
  396. video_url_dict["video_height"] = 0
  397. elif "normal" in video_info["videoResource"]:
  398. if (
  399. "video_list" in video_info["videoResource"]["normal"]
  400. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  401. ):
  402. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  403. "backup_url_1"
  404. ]
  405. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  406. "backup_url_1"
  407. ]
  408. if len(video_url) % 3 == 1:
  409. video_url += "=="
  410. elif len(video_url) % 3 == 2:
  411. video_url += "="
  412. elif len(audio_url) % 3 == 1:
  413. audio_url += "=="
  414. elif len(audio_url) % 3 == 2:
  415. audio_url += "="
  416. video_url = base64.b64decode(video_url).decode("utf8")
  417. audio_url = base64.b64decode(audio_url).decode("utf8")
  418. video_width = video_info["videoResource"]["normal"]["video_list"][
  419. "video_4"
  420. ]["vwidth"]
  421. video_height = video_info["videoResource"]["normal"]["video_list"][
  422. "video_4"
  423. ]["vheight"]
  424. video_url_dict["video_url"] = video_url
  425. video_url_dict["audio_url"] = audio_url
  426. video_url_dict["video_width"] = video_width
  427. video_url_dict["video_height"] = video_height
  428. elif (
  429. "video_list" in video_info["videoResource"]["normal"]
  430. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  431. ):
  432. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  433. "backup_url_1"
  434. ]
  435. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  436. "backup_url_1"
  437. ]
  438. if len(video_url) % 3 == 1:
  439. video_url += "=="
  440. elif len(video_url) % 3 == 2:
  441. video_url += "="
  442. elif len(audio_url) % 3 == 1:
  443. audio_url += "=="
  444. elif len(audio_url) % 3 == 2:
  445. audio_url += "="
  446. video_url = base64.b64decode(video_url).decode("utf8")
  447. audio_url = base64.b64decode(audio_url).decode("utf8")
  448. video_width = video_info["videoResource"]["normal"]["video_list"][
  449. "video_3"
  450. ]["vwidth"]
  451. video_height = video_info["videoResource"]["normal"]["video_list"][
  452. "video_3"
  453. ]["vheight"]
  454. video_url_dict["video_url"] = video_url
  455. video_url_dict["audio_url"] = audio_url
  456. video_url_dict["video_width"] = video_width
  457. video_url_dict["video_height"] = video_height
  458. elif (
  459. "video_list" in video_info["videoResource"]["normal"]
  460. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  461. ):
  462. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  463. "backup_url_1"
  464. ]
  465. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  466. "backup_url_1"
  467. ]
  468. if len(video_url) % 3 == 1:
  469. video_url += "=="
  470. elif len(video_url) % 3 == 2:
  471. video_url += "="
  472. elif len(audio_url) % 3 == 1:
  473. audio_url += "=="
  474. elif len(audio_url) % 3 == 2:
  475. audio_url += "="
  476. video_url = base64.b64decode(video_url).decode("utf8")
  477. audio_url = base64.b64decode(audio_url).decode("utf8")
  478. video_width = video_info["videoResource"]["normal"]["video_list"][
  479. "video_2"
  480. ]["vwidth"]
  481. video_height = video_info["videoResource"]["normal"]["video_list"][
  482. "video_2"
  483. ]["vheight"]
  484. video_url_dict["video_url"] = video_url
  485. video_url_dict["audio_url"] = audio_url
  486. video_url_dict["video_width"] = video_width
  487. video_url_dict["video_height"] = video_height
  488. elif (
  489. "video_list" in video_info["videoResource"]["normal"]
  490. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  491. ):
  492. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  493. "backup_url_1"
  494. ]
  495. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  496. "backup_url_1"
  497. ]
  498. if len(video_url) % 3 == 1:
  499. video_url += "=="
  500. elif len(video_url) % 3 == 2:
  501. video_url += "="
  502. elif len(audio_url) % 3 == 1:
  503. audio_url += "=="
  504. elif len(audio_url) % 3 == 2:
  505. audio_url += "="
  506. video_url = base64.b64decode(video_url).decode("utf8")
  507. audio_url = base64.b64decode(audio_url).decode("utf8")
  508. video_width = video_info["videoResource"]["normal"]["video_list"][
  509. "video_1"
  510. ]["vwidth"]
  511. video_height = video_info["videoResource"]["normal"]["video_list"][
  512. "video_1"
  513. ]["vheight"]
  514. video_url_dict["video_url"] = video_url
  515. video_url_dict["audio_url"] = audio_url
  516. video_url_dict["video_width"] = video_width
  517. video_url_dict["video_height"] = video_height
  518. elif (
  519. "dynamic_video" in video_info["videoResource"]["normal"]
  520. and "dynamic_video_list"
  521. in video_info["videoResource"]["normal"]["dynamic_video"]
  522. and "dynamic_audio_list"
  523. in video_info["videoResource"]["normal"]["dynamic_video"]
  524. and len(
  525. video_info["videoResource"]["normal"]["dynamic_video"][
  526. "dynamic_video_list"
  527. ]
  528. )
  529. != 0
  530. and len(
  531. video_info["videoResource"]["normal"]["dynamic_video"][
  532. "dynamic_audio_list"
  533. ]
  534. )
  535. != 0
  536. ):
  537. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  538. "dynamic_video_list"
  539. ][-1]["backup_url_1"]
  540. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  541. "dynamic_audio_list"
  542. ][-1]["backup_url_1"]
  543. if len(video_url) % 3 == 1:
  544. video_url += "=="
  545. elif len(video_url) % 3 == 2:
  546. video_url += "="
  547. elif len(audio_url) % 3 == 1:
  548. audio_url += "=="
  549. elif len(audio_url) % 3 == 2:
  550. audio_url += "="
  551. video_url = base64.b64decode(video_url).decode("utf8")
  552. audio_url = base64.b64decode(audio_url).decode("utf8")
  553. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  554. "dynamic_video_list"
  555. ][-1]["vwidth"]
  556. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  557. "dynamic_video_list"
  558. ][-1]["vheight"]
  559. video_url_dict["video_url"] = video_url
  560. video_url_dict["audio_url"] = audio_url
  561. video_url_dict["video_width"] = video_width
  562. video_url_dict["video_height"] = video_height
  563. else:
  564. video_url_dict["video_url"] = ""
  565. video_url_dict["audio_url"] = ""
  566. video_url_dict["video_width"] = 0
  567. video_url_dict["video_height"] = 0
  568. else:
  569. video_url_dict["video_url"] = ""
  570. video_url_dict["audio_url"] = ""
  571. video_url_dict["video_width"] = 0
  572. video_url_dict["video_height"] = 0
  573. return video_url_dict
  574. def get_comment_cnt(item_id):
  575. """
  576. 获取视频的评论数量
  577. """
  578. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  579. params = {
  580. "tab_index": "0",
  581. "count": "10",
  582. "offset": "10",
  583. "group_id": str(item_id),
  584. "item_id": str(item_id),
  585. "aid": "1768",
  586. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  587. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  588. "_signature": random_signature(),
  589. }
  590. headers = {
  591. "authority": "www.ixigua.com",
  592. "accept": "application/json, text/plain, */*",
  593. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  594. "cache-control": "no-cache",
  595. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  596. "pragma": "no-cache",
  597. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  598. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  599. "sec-ch-ua-mobile": "?0",
  600. "sec-ch-ua-platform": '"macOS"',
  601. "sec-fetch-dest": "empty",
  602. "sec-fetch-mode": "cors",
  603. "sec-fetch-site": "same-origin",
  604. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  605. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  606. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  607. }
  608. response = requests.get(
  609. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  610. )
  611. response.close()
  612. if (
  613. response.status_code != 200
  614. or "total_number" not in response.json()
  615. or response.json() == {}
  616. ):
  617. return 0
  618. return response.json().get("total_number", 0)
  619. class XiGuaAuthor:
  620. """
  621. 西瓜账号爬虫
  622. """
  623. def __init__(self, platform, mode, rule_dict, env, user_list):
  624. self.platform = platform
  625. self.mode = mode
  626. self.rule_dict = rule_dict
  627. self.env = env
  628. self.user_list = user_list
  629. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  630. self.download_count = 0
  631. def rule_maker(self, account):
  632. """
  633. 通过不同的账号生成不同的规则
  634. :param account: 输入的账号信息
  635. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  636. """
  637. flag = account['link'].split("_")[0]
  638. if flag == "V1":
  639. rule_dict = {
  640. "play_cnt": {"min": 100000, "max": 0},
  641. 'period': {"min": 90, "max": 90},
  642. 'special': 0.02
  643. }
  644. return rule_dict
  645. elif flag == "V2":
  646. rule_dict = {
  647. "play_cnt": {"min": 10000, "max": 0},
  648. 'period': {"min": 90, "max": 90},
  649. 'special': 0.01
  650. }
  651. return rule_dict
  652. elif flag == "V3":
  653. rule_dict = {
  654. "play_cnt": {"min": 5000, "max": 0},
  655. 'period': {"min": 90, "max": 90},
  656. 'special': 0.01
  657. }
  658. return rule_dict
  659. else:
  660. return self.rule_dict
  661. def get_author_list(self):
  662. # 每轮只抓取定量的数据,到达数量后自己退出
  663. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  664. for user_dict in self.user_list:
  665. # if self.download_count <= max_count:
  666. try:
  667. self.get_video_list(user_dict)
  668. except Exception as e:
  669. AliyunLogger.logging(
  670. code="3001",
  671. account=user_dict["uid"],
  672. platform=self.platform,
  673. mode=self.mode,
  674. env=self.env,
  675. message="扫描账号时出现bug, 报错是 {}".format(e)
  676. )
  677. # time.sleep(random.randint(1, 15))
  678. # else:
  679. # AliyunLogger.logging(
  680. # code="2000",
  681. # platform=self.platform,
  682. # mode=self.mode,
  683. # env=self.env,
  684. # message="本轮已经抓取足够数量的视频,已经自动退出",
  685. # )
  686. # return
  def get_video_list(self, user_dict):
      """Page through an author's newest videos and process each one.

      The author's video list is requested 30 items at a time and every
      item is handed to ``process_video_obj``. Paging stops when the
      response is unusable, when a page contains no videos, or when
      ``process_video_obj`` returns a falsy value.

      :param user_dict: author record; reads ``link`` (home-page URL,
          optionally preceded by a two-character prefix starting with
          "V") and ``uid``.
      :return: None
      """
      page_size = 30
      url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
      signature = random_signature()

      # The author id and the headers are identical for every page, so
      # they are derived once instead of on each loop iteration.
      link = user_dict["link"]
      if link.startswith("V"):
          # Drop the two-character prefix in front of the real URL.
          link = link[2:]
      to_user_id = link.replace("https://www.ixigua.com/home/", "")
      headers = {
          "referer": f'https://www.ixigua.com/home/{to_user_id}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
          "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
      }

      offset = 0
      while True:
          # Only `_signature` is sent; msToken / X-Bogus are not used here.
          params = {
              "to_user_id": to_user_id,
              "offset": str(offset),
              "limit": str(page_size),
              "maxBehotTime": "0",
              "order": "new",
              "isHome": "0",
              "_signature": signature,
          }
          response = requests.get(
              url=url,
              headers=headers,
              params=params,
              proxies=tunnel_proxies(),
              timeout=5,
          )
          offset += page_size

          if "data" not in response.text or response.status_code != 200:
              AliyunLogger.logging(
                  code="2000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message=f"get_videoList:{response.text}\n",
              )
              return

          # Parse the body once; a missing or null "data"/"videoList" is
          # treated the same as an empty page instead of raising.
          feeds = (response.json().get("data") or {}).get("videoList")
          if not feeds:
              AliyunLogger.logging(
                  code="2000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="没有更多数据啦~\n",
              )
              return

          for video_obj in feeds:
              try:
                  AliyunLogger.logging(
                      code="1001",
                      account=user_dict["uid"],
                      platform=self.platform,
                      mode=self.mode,
                      env=self.env,
                      data=video_obj,
                      message="扫描到一条视频",
                  )
                  date_flag = self.process_video_obj(video_obj, user_dict)
                  if not date_flag:
                      # A falsy result ends the scan for this author.
                      return
              except Exception as e:
                  # One broken item must not abort the whole page.
                  AliyunLogger.logging(
                      code="3000",
                      platform=self.platform,
                      mode=self.mode,
                      env=self.env,
                      data=video_obj,
                      message="抓取单条视频异常, 报错原因是: {}".format(e),
                  )
  def process_video_obj(self, video_obj, user_dict):
      """Validate one feed item and forward it to ETL when it qualifies.

      :param video_obj: one entry of the author feed; reads ``item_id``
          and ``is_top``.
      :param user_dict: author record; reads ``uid`` and is passed to
          ``rule_maker``.
      :return: ``False`` when a non-pinned video is older than the rule's
          ``period.max`` days, ``None`` when the item has no ``item_id``,
          otherwise ``True`` (whether or not the video was sent).
      """
      new_rule = self.rule_maker(user_dict)
      trace_id = self.platform + str(uuid.uuid1())
      item_id = video_obj.get("item_id", "")
      if not item_id:
          AliyunLogger.logging(
              code="2005",
              account=user_dict["uid"],
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="无效视频",
              data=video_obj,
              trace_id=trace_id,
          )
          # NOTE(review): None is falsy, so a caller that stops on a falsy
          # result ends the author scan here - confirm this is intended.
          return None

      # Fetch the detail record for this item.
      video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
      if not video_dict:
          # No detail record is available. Skip this item and keep
          # scanning instead of failing on a None subscript below.
          return True

      now = int(time.time())
      video_dict.update(
          {
              # Values are evaluated before the update is applied, so
              # "out_user_id" still receives the platform-side user id
              # even though "user_id" is overwritten in the same call.
              "out_user_id": video_dict["user_id"],
              "platform": self.platform,
              "strategy": self.mode,
              "out_video_id": video_dict["video_id"],
              "width": video_dict["video_width"],
              "height": video_dict["video_height"],
              "crawler_rule": json.dumps(new_rule),
              "user_id": user_dict["uid"],
              "publish_time": video_dict["publish_time_str"],
              "strategy_type": self.mode,
              "update_time_stamp": now,
          }
      )

      max_days = int(new_rule.get("period", {}).get("max", 1000))
      if now - video_dict["publish_time_stamp"] > 3600 * 24 * max_days:
          # Only a non-pinned video that is too old ends the scan; a
          # pinned one falls through to the normal checks below.
          if not video_obj["is_top"]:
              AliyunLogger.logging(
                  code="2004",
                  account=user_dict["uid"],
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  data=video_dict,
                  message="发布时间超过{}天".format(max_days),
              )
              return False

      pipeline = PiaoQuanPipeline(
          platform=self.platform,
          mode=self.mode,
          rule_dict=new_rule,
          env=self.env,
          item=video_dict,
          trace_id=trace_id,
      )
      # Both checks are always executed, matching the original call order.
      title_flag = pipeline.title_flag()
      repeat_flag = pipeline.repeat_video()
      if not (title_flag and repeat_flag):
          return True

      play_cnt = int(video_dict["play_cnt"])
      min_play_cnt = int(new_rule.get("play_cnt", {}).get("min", 100000))
      if play_cnt < min_play_cnt:
          AliyunLogger.logging(
              code="2008",
              account=user_dict["uid"],
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="不满足特殊规则, 播放量",
              data=video_dict,
          )
          return True

      special_ratio = new_rule.get("special")
      if special_ratio:
          # Guard the division: a zero play count cannot satisfy a ratio.
          like_ratio = float(video_dict["like_cnt"]) / play_cnt if play_cnt else 0.0
          if like_ratio < special_ratio:
              AliyunLogger.logging(
                  code="2008",
                  account=user_dict["uid"],
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="不满足特殊规则, 点赞量/播放量",
                  data=video_dict,
              )
              return True

      self.mq.send_msg(video_dict)
      self.download_count += 1
      AliyunLogger.logging(
          code="1002",
          account=user_dict["uid"],
          platform=self.platform,
          mode=self.mode,
          env=self.env,
          data=video_dict,
          trace_id=trace_id,
          message="成功发送 MQ 至 ETL",
      )
      return True
  def get_video_info(self, item_id, trace_id):
      """Fetch the detail record of one video and flatten it into a dict.

      :param item_id: id of the video; sent as ``mixId`` and also passed
          to ``get_comment_cnt``.
      :param trace_id: id attached to the failure log entry.
      :return: dict with title, ids, counters, dimensions, publish time,
          author fields and media URLs, or ``None`` when the request did
          not return a usable video record.
      """
      url = "https://www.ixigua.com/api/mixVideo/information?"
      headers = {
          "accept-encoding": "gzip, deflate",
          "accept-language": "zh-CN,zh-Hans;q=0.9",
          "user-agent": FakeUserAgent().random,
          "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
      }
      # NOTE(review): tokens, signatures and cookies below are hard-coded
      # values copied verbatim; whether they expire is not visible here.
      params = {
          "mixId": str(item_id),
          "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
          "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
          "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
          "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
          "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
      }
      cookies = {
          "ixigua-a-s": "1",
          "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
          "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
          "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
          "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
          "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
          "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
          "__ac_nonce": "06304878000964fdad287",
          "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
          "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
          "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
          "_tea_utm_cache_1300": "undefined",
          "support_avif": "false",
          "support_webp": "false",
          "xiguavideopcwebid": "7134967546256016900",
          "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
      }
      response = requests.get(
          url=url,
          headers=headers,
          params=params,
          cookies=cookies,
          proxies=tunnel_proxies(),
          timeout=5,
      )

      # Parse the body once, and only for a 200 response. Null or missing
      # intermediate nodes collapse to an empty record instead of raising.
      video_info = {}
      if response.status_code == 200:
          data = response.json().get("data") or {}
          packer_data = (data.get("gidInformation") or {}).get("packerData") or {}
          video_info = packer_data.get("video") or {}

      if not video_info:
          AliyunLogger.logging(
              code="2000",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="获取视频信息失败",
              trace_id=trace_id,
          )
          return None

      # get_video_url supplies the dimensions and the media URLs read below.
      video_detail = get_video_url(video_info)
      publish_ts = int(video_info.get("video_publish_time", 0))
      user_info = video_info.get("user_info", {})
      video_dict = {
          "video_title": video_info.get("title", ""),
          "video_id": video_info.get("videoResource", {}).get("vid", ""),
          "gid": str(item_id),
          "play_cnt": int(video_info.get("video_watch_count", 0)),
          "like_cnt": int(video_info.get("video_like_count", 0)),
          "comment_cnt": int(get_comment_cnt(item_id)),
          "share_cnt": 0,
          "favorite_cnt": 0,
          "duration": int(video_info.get("video_duration", 0)),
          "video_width": int(video_detail["video_width"]),
          "video_height": int(video_detail["video_height"]),
          "publish_time_stamp": publish_ts,
          "publish_time_str": time.strftime(
              "%Y-%m-%d %H:%M:%S", time.localtime(publish_ts)
          ),
          "user_name": user_info.get("name", ""),
          "user_id": str(user_info.get("user_id", "")),
          "avatar_url": str(user_info.get("avatar_url", "")),
          "cover_url": video_info.get("poster_url", ""),
          "audio_url": video_detail["audio_url"],
          "video_url": video_detail["video_url"],
          "session": f"xigua-search-{int(time.time())}",
      }
      return video_dict
  if __name__ == "__main__":
      # Sample author records for a manual run. Every entry shares the
      # same uid and differs only in home-page id and nickname.
      user_list = [
          {
              "uid": 6267140,
              "source": "xigua",
              "link": "https://www.ixigua.com/home/" + home_id,
              "nick_name": nick_name,
              "avatar_url": "",
              "mode": "author",
          }
          for home_id, nick_name in (
              ("2779177225827568", "秋晴爱音乐"),
              ("2885546124776780", "朗诵放歌的老山羊"),
              ("5880938217", "天原声疗"),
          )
      ]
      # Example driver, left disabled:
      # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
      # XGA = XiGuaAuthor(
      #     platform="xigua",
      #     mode="author",
      #     rule_dict=rule,
      #     env="prod",
      #     user_list=user_list
      # )
      # XGA.get_author_list()