# xigua_author.py
  1. import json
  2. import os
  3. import random
  4. import sys
  5. import string
  6. import time
  7. import uuid
  8. import base64
  9. import requests
  10. from fake_useragent import FakeUserAgent
  11. from common.mq import MQ
  12. sys.path.append(os.getcwd())
  13. from common import AliyunLogger, PiaoQuanPipeline
  14. def tunnel_proxies():
  15. # 隧道域名:端口号
  16. tunnel = "q796.kdltps.com:15818"
  17. # 用户名密码方式
  18. username = "t17772369458618"
  19. password = "5zqcjkmy"
  20. tunnel_proxies = {
  21. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  22. % {"user": username, "pwd": password, "proxy": tunnel},
  23. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  24. % {"user": username, "pwd": password, "proxy": tunnel},
  25. }
  26. return tunnel_proxies
  27. def random_signature():
  28. src_digits = string.digits # string_数字
  29. src_uppercase = string.ascii_uppercase # string_大写字母
  30. src_lowercase = string.ascii_lowercase # string_小写字母
  31. digits_num = random.randint(1, 6)
  32. uppercase_num = random.randint(1, 26 - digits_num - 1)
  33. lowercase_num = 26 - (digits_num + uppercase_num)
  34. password = (
  35. random.sample(src_digits, digits_num)
  36. + random.sample(src_uppercase, uppercase_num)
  37. + random.sample(src_lowercase, lowercase_num)
  38. )
  39. random.shuffle(password)
  40. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  41. new_password_start = new_password[0:18]
  42. new_password_end = new_password[-7:]
  43. if new_password[18] == "8":
  44. new_password = new_password_start + "w" + new_password_end
  45. elif new_password[18] == "9":
  46. new_password = new_password_start + "x" + new_password_end
  47. elif new_password[18] == "-":
  48. new_password = new_password_start + "y" + new_password_end
  49. elif new_password[18] == ".":
  50. new_password = new_password_start + "z" + new_password_end
  51. else:
  52. new_password = new_password_start + "y" + new_password_end
  53. return new_password
  54. def get_video_url(video_info):
  55. video_url_dict = {}
  56. # video_url
  57. if "videoResource" not in video_info:
  58. video_url_dict["video_url"] = ""
  59. video_url_dict["audio_url"] = ""
  60. video_url_dict["video_width"] = 0
  61. video_url_dict["video_height"] = 0
  62. elif "dash_120fps" in video_info["videoResource"]:
  63. if (
  64. "video_list" in video_info["videoResource"]["dash_120fps"]
  65. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  66. ):
  67. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  68. "video_4"
  69. ]["backup_url_1"]
  70. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  71. "video_4"
  72. ]["backup_url_1"]
  73. if len(video_url) % 3 == 1:
  74. video_url += "=="
  75. elif len(video_url) % 3 == 2:
  76. video_url += "="
  77. elif len(audio_url) % 3 == 1:
  78. audio_url += "=="
  79. elif len(audio_url) % 3 == 2:
  80. audio_url += "="
  81. video_url = base64.b64decode(video_url).decode("utf8")
  82. audio_url = base64.b64decode(audio_url).decode("utf8")
  83. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  84. "video_4"
  85. ]["vwidth"]
  86. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  87. "video_4"
  88. ]["vheight"]
  89. video_url_dict["video_url"] = video_url
  90. video_url_dict["audio_url"] = audio_url
  91. video_url_dict["video_width"] = video_width
  92. video_url_dict["video_height"] = video_height
  93. elif (
  94. "video_list" in video_info["videoResource"]["dash_120fps"]
  95. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  96. ):
  97. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  98. "video_3"
  99. ]["backup_url_1"]
  100. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  101. "video_3"
  102. ]["backup_url_1"]
  103. if len(video_url) % 3 == 1:
  104. video_url += "=="
  105. elif len(video_url) % 3 == 2:
  106. video_url += "="
  107. elif len(audio_url) % 3 == 1:
  108. audio_url += "=="
  109. elif len(audio_url) % 3 == 2:
  110. audio_url += "="
  111. video_url = base64.b64decode(video_url).decode("utf8")
  112. audio_url = base64.b64decode(audio_url).decode("utf8")
  113. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  114. "video_3"
  115. ]["vwidth"]
  116. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  117. "video_3"
  118. ]["vheight"]
  119. video_url_dict["video_url"] = video_url
  120. video_url_dict["audio_url"] = audio_url
  121. video_url_dict["video_width"] = video_width
  122. video_url_dict["video_height"] = video_height
  123. elif (
  124. "video_list" in video_info["videoResource"]["dash_120fps"]
  125. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  126. ):
  127. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  128. "video_2"
  129. ]["backup_url_1"]
  130. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  131. "video_2"
  132. ]["backup_url_1"]
  133. if len(video_url) % 3 == 1:
  134. video_url += "=="
  135. elif len(video_url) % 3 == 2:
  136. video_url += "="
  137. elif len(audio_url) % 3 == 1:
  138. audio_url += "=="
  139. elif len(audio_url) % 3 == 2:
  140. audio_url += "="
  141. video_url = base64.b64decode(video_url).decode("utf8")
  142. audio_url = base64.b64decode(audio_url).decode("utf8")
  143. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  144. "video_2"
  145. ]["vwidth"]
  146. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  147. "video_2"
  148. ]["vheight"]
  149. video_url_dict["video_url"] = video_url
  150. video_url_dict["audio_url"] = audio_url
  151. video_url_dict["video_width"] = video_width
  152. video_url_dict["video_height"] = video_height
  153. elif (
  154. "video_list" in video_info["videoResource"]["dash_120fps"]
  155. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  156. ):
  157. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  158. "video_1"
  159. ]["backup_url_1"]
  160. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  161. "video_1"
  162. ]["backup_url_1"]
  163. if len(video_url) % 3 == 1:
  164. video_url += "=="
  165. elif len(video_url) % 3 == 2:
  166. video_url += "="
  167. elif len(audio_url) % 3 == 1:
  168. audio_url += "=="
  169. elif len(audio_url) % 3 == 2:
  170. audio_url += "="
  171. video_url = base64.b64decode(video_url).decode("utf8")
  172. audio_url = base64.b64decode(audio_url).decode("utf8")
  173. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  174. "video_1"
  175. ]["vwidth"]
  176. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  177. "video_1"
  178. ]["vheight"]
  179. video_url_dict["video_url"] = video_url
  180. video_url_dict["audio_url"] = audio_url
  181. video_url_dict["video_width"] = video_width
  182. video_url_dict["video_height"] = video_height
  183. elif (
  184. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  185. and "dynamic_video_list"
  186. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  187. and "dynamic_audio_list"
  188. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  189. and len(
  190. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  191. "dynamic_video_list"
  192. ]
  193. )
  194. != 0
  195. and len(
  196. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  197. "dynamic_audio_list"
  198. ]
  199. )
  200. != 0
  201. ):
  202. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  203. "dynamic_video_list"
  204. ][-1]["backup_url_1"]
  205. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  206. "dynamic_audio_list"
  207. ][-1]["backup_url_1"]
  208. if len(video_url) % 3 == 1:
  209. video_url += "=="
  210. elif len(video_url) % 3 == 2:
  211. video_url += "="
  212. elif len(audio_url) % 3 == 1:
  213. audio_url += "=="
  214. elif len(audio_url) % 3 == 2:
  215. audio_url += "="
  216. video_url = base64.b64decode(video_url).decode("utf8")
  217. audio_url = base64.b64decode(audio_url).decode("utf8")
  218. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  219. "dynamic_video_list"
  220. ][-1]["vwidth"]
  221. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  222. "dynamic_video_list"
  223. ][-1]["vheight"]
  224. video_url_dict["video_url"] = video_url
  225. video_url_dict["audio_url"] = audio_url
  226. video_url_dict["video_width"] = video_width
  227. video_url_dict["video_height"] = video_height
  228. else:
  229. video_url_dict["video_url"] = ""
  230. video_url_dict["audio_url"] = ""
  231. video_url_dict["video_width"] = 0
  232. video_url_dict["video_height"] = 0
  233. elif "dash" in video_info["videoResource"]:
  234. if (
  235. "video_list" in video_info["videoResource"]["dash"]
  236. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  237. ):
  238. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  239. "backup_url_1"
  240. ]
  241. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  242. "backup_url_1"
  243. ]
  244. if len(video_url) % 3 == 1:
  245. video_url += "=="
  246. elif len(video_url) % 3 == 2:
  247. video_url += "="
  248. elif len(audio_url) % 3 == 1:
  249. audio_url += "=="
  250. elif len(audio_url) % 3 == 2:
  251. audio_url += "="
  252. video_url = base64.b64decode(video_url).decode("utf8")
  253. audio_url = base64.b64decode(audio_url).decode("utf8")
  254. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  255. "vwidth"
  256. ]
  257. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  258. "vheight"
  259. ]
  260. video_url_dict["video_url"] = video_url
  261. video_url_dict["audio_url"] = audio_url
  262. video_url_dict["video_width"] = video_width
  263. video_url_dict["video_height"] = video_height
  264. elif (
  265. "video_list" in video_info["videoResource"]["dash"]
  266. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  267. ):
  268. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  269. "backup_url_1"
  270. ]
  271. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  272. "backup_url_1"
  273. ]
  274. if len(video_url) % 3 == 1:
  275. video_url += "=="
  276. elif len(video_url) % 3 == 2:
  277. video_url += "="
  278. elif len(audio_url) % 3 == 1:
  279. audio_url += "=="
  280. elif len(audio_url) % 3 == 2:
  281. audio_url += "="
  282. video_url = base64.b64decode(video_url).decode("utf8")
  283. audio_url = base64.b64decode(audio_url).decode("utf8")
  284. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  285. "vwidth"
  286. ]
  287. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  288. "vheight"
  289. ]
  290. video_url_dict["video_url"] = video_url
  291. video_url_dict["audio_url"] = audio_url
  292. video_url_dict["video_width"] = video_width
  293. video_url_dict["video_height"] = video_height
  294. elif (
  295. "video_list" in video_info["videoResource"]["dash"]
  296. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  297. ):
  298. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  299. "backup_url_1"
  300. ]
  301. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  302. "backup_url_1"
  303. ]
  304. if len(video_url) % 3 == 1:
  305. video_url += "=="
  306. elif len(video_url) % 3 == 2:
  307. video_url += "="
  308. elif len(audio_url) % 3 == 1:
  309. audio_url += "=="
  310. elif len(audio_url) % 3 == 2:
  311. audio_url += "="
  312. video_url = base64.b64decode(video_url).decode("utf8")
  313. audio_url = base64.b64decode(audio_url).decode("utf8")
  314. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  315. "vwidth"
  316. ]
  317. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  318. "vheight"
  319. ]
  320. video_url_dict["video_url"] = video_url
  321. video_url_dict["audio_url"] = audio_url
  322. video_url_dict["video_width"] = video_width
  323. video_url_dict["video_height"] = video_height
  324. elif (
  325. "video_list" in video_info["videoResource"]["dash"]
  326. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  327. ):
  328. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  329. "backup_url_1"
  330. ]
  331. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  332. "backup_url_1"
  333. ]
  334. if len(video_url) % 3 == 1:
  335. video_url += "=="
  336. elif len(video_url) % 3 == 2:
  337. video_url += "="
  338. elif len(audio_url) % 3 == 1:
  339. audio_url += "=="
  340. elif len(audio_url) % 3 == 2:
  341. audio_url += "="
  342. video_url = base64.b64decode(video_url).decode("utf8")
  343. audio_url = base64.b64decode(audio_url).decode("utf8")
  344. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  345. "vwidth"
  346. ]
  347. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  348. "vheight"
  349. ]
  350. video_url_dict["video_url"] = video_url
  351. video_url_dict["audio_url"] = audio_url
  352. video_url_dict["video_width"] = video_width
  353. video_url_dict["video_height"] = video_height
  354. elif (
  355. "dynamic_video" in video_info["videoResource"]["dash"]
  356. and "dynamic_video_list"
  357. in video_info["videoResource"]["dash"]["dynamic_video"]
  358. and "dynamic_audio_list"
  359. in video_info["videoResource"]["dash"]["dynamic_video"]
  360. and len(
  361. video_info["videoResource"]["dash"]["dynamic_video"][
  362. "dynamic_video_list"
  363. ]
  364. )
  365. != 0
  366. and len(
  367. video_info["videoResource"]["dash"]["dynamic_video"][
  368. "dynamic_audio_list"
  369. ]
  370. )
  371. != 0
  372. ):
  373. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  374. "dynamic_video_list"
  375. ][-1]["backup_url_1"]
  376. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  377. "dynamic_audio_list"
  378. ][-1]["backup_url_1"]
  379. if len(video_url) % 3 == 1:
  380. video_url += "=="
  381. elif len(video_url) % 3 == 2:
  382. video_url += "="
  383. elif len(audio_url) % 3 == 1:
  384. audio_url += "=="
  385. elif len(audio_url) % 3 == 2:
  386. audio_url += "="
  387. video_url = base64.b64decode(video_url).decode("utf8")
  388. audio_url = base64.b64decode(audio_url).decode("utf8")
  389. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  390. "dynamic_video_list"
  391. ][-1]["vwidth"]
  392. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  393. "dynamic_video_list"
  394. ][-1]["vheight"]
  395. video_url_dict["video_url"] = video_url
  396. video_url_dict["audio_url"] = audio_url
  397. video_url_dict["video_width"] = video_width
  398. video_url_dict["video_height"] = video_height
  399. else:
  400. video_url_dict["video_url"] = ""
  401. video_url_dict["audio_url"] = ""
  402. video_url_dict["video_width"] = 0
  403. video_url_dict["video_height"] = 0
  404. elif "normal" in video_info["videoResource"]:
  405. if (
  406. "video_list" in video_info["videoResource"]["normal"]
  407. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  408. ):
  409. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  410. "backup_url_1"
  411. ]
  412. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  413. "backup_url_1"
  414. ]
  415. if len(video_url) % 3 == 1:
  416. video_url += "=="
  417. elif len(video_url) % 3 == 2:
  418. video_url += "="
  419. elif len(audio_url) % 3 == 1:
  420. audio_url += "=="
  421. elif len(audio_url) % 3 == 2:
  422. audio_url += "="
  423. video_url = base64.b64decode(video_url).decode("utf8")
  424. audio_url = base64.b64decode(audio_url).decode("utf8")
  425. video_width = video_info["videoResource"]["normal"]["video_list"][
  426. "video_4"
  427. ]["vwidth"]
  428. video_height = video_info["videoResource"]["normal"]["video_list"][
  429. "video_4"
  430. ]["vheight"]
  431. video_url_dict["video_url"] = video_url
  432. video_url_dict["audio_url"] = audio_url
  433. video_url_dict["video_width"] = video_width
  434. video_url_dict["video_height"] = video_height
  435. elif (
  436. "video_list" in video_info["videoResource"]["normal"]
  437. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  438. ):
  439. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  440. "backup_url_1"
  441. ]
  442. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  443. "backup_url_1"
  444. ]
  445. if len(video_url) % 3 == 1:
  446. video_url += "=="
  447. elif len(video_url) % 3 == 2:
  448. video_url += "="
  449. elif len(audio_url) % 3 == 1:
  450. audio_url += "=="
  451. elif len(audio_url) % 3 == 2:
  452. audio_url += "="
  453. video_url = base64.b64decode(video_url).decode("utf8")
  454. audio_url = base64.b64decode(audio_url).decode("utf8")
  455. video_width = video_info["videoResource"]["normal"]["video_list"][
  456. "video_3"
  457. ]["vwidth"]
  458. video_height = video_info["videoResource"]["normal"]["video_list"][
  459. "video_3"
  460. ]["vheight"]
  461. video_url_dict["video_url"] = video_url
  462. video_url_dict["audio_url"] = audio_url
  463. video_url_dict["video_width"] = video_width
  464. video_url_dict["video_height"] = video_height
  465. elif (
  466. "video_list" in video_info["videoResource"]["normal"]
  467. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  468. ):
  469. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  470. "backup_url_1"
  471. ]
  472. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  473. "backup_url_1"
  474. ]
  475. if len(video_url) % 3 == 1:
  476. video_url += "=="
  477. elif len(video_url) % 3 == 2:
  478. video_url += "="
  479. elif len(audio_url) % 3 == 1:
  480. audio_url += "=="
  481. elif len(audio_url) % 3 == 2:
  482. audio_url += "="
  483. video_url = base64.b64decode(video_url).decode("utf8")
  484. audio_url = base64.b64decode(audio_url).decode("utf8")
  485. video_width = video_info["videoResource"]["normal"]["video_list"][
  486. "video_2"
  487. ]["vwidth"]
  488. video_height = video_info["videoResource"]["normal"]["video_list"][
  489. "video_2"
  490. ]["vheight"]
  491. video_url_dict["video_url"] = video_url
  492. video_url_dict["audio_url"] = audio_url
  493. video_url_dict["video_width"] = video_width
  494. video_url_dict["video_height"] = video_height
  495. elif (
  496. "video_list" in video_info["videoResource"]["normal"]
  497. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  498. ):
  499. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  500. "backup_url_1"
  501. ]
  502. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  503. "backup_url_1"
  504. ]
  505. if len(video_url) % 3 == 1:
  506. video_url += "=="
  507. elif len(video_url) % 3 == 2:
  508. video_url += "="
  509. elif len(audio_url) % 3 == 1:
  510. audio_url += "=="
  511. elif len(audio_url) % 3 == 2:
  512. audio_url += "="
  513. video_url = base64.b64decode(video_url).decode("utf8")
  514. audio_url = base64.b64decode(audio_url).decode("utf8")
  515. video_width = video_info["videoResource"]["normal"]["video_list"][
  516. "video_1"
  517. ]["vwidth"]
  518. video_height = video_info["videoResource"]["normal"]["video_list"][
  519. "video_1"
  520. ]["vheight"]
  521. video_url_dict["video_url"] = video_url
  522. video_url_dict["audio_url"] = audio_url
  523. video_url_dict["video_width"] = video_width
  524. video_url_dict["video_height"] = video_height
  525. elif (
  526. "dynamic_video" in video_info["videoResource"]["normal"]
  527. and "dynamic_video_list"
  528. in video_info["videoResource"]["normal"]["dynamic_video"]
  529. and "dynamic_audio_list"
  530. in video_info["videoResource"]["normal"]["dynamic_video"]
  531. and len(
  532. video_info["videoResource"]["normal"]["dynamic_video"][
  533. "dynamic_video_list"
  534. ]
  535. )
  536. != 0
  537. and len(
  538. video_info["videoResource"]["normal"]["dynamic_video"][
  539. "dynamic_audio_list"
  540. ]
  541. )
  542. != 0
  543. ):
  544. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  545. "dynamic_video_list"
  546. ][-1]["backup_url_1"]
  547. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  548. "dynamic_audio_list"
  549. ][-1]["backup_url_1"]
  550. if len(video_url) % 3 == 1:
  551. video_url += "=="
  552. elif len(video_url) % 3 == 2:
  553. video_url += "="
  554. elif len(audio_url) % 3 == 1:
  555. audio_url += "=="
  556. elif len(audio_url) % 3 == 2:
  557. audio_url += "="
  558. video_url = base64.b64decode(video_url).decode("utf8")
  559. audio_url = base64.b64decode(audio_url).decode("utf8")
  560. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  561. "dynamic_video_list"
  562. ][-1]["vwidth"]
  563. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  564. "dynamic_video_list"
  565. ][-1]["vheight"]
  566. video_url_dict["video_url"] = video_url
  567. video_url_dict["audio_url"] = audio_url
  568. video_url_dict["video_width"] = video_width
  569. video_url_dict["video_height"] = video_height
  570. else:
  571. video_url_dict["video_url"] = ""
  572. video_url_dict["audio_url"] = ""
  573. video_url_dict["video_width"] = 0
  574. video_url_dict["video_height"] = 0
  575. else:
  576. video_url_dict["video_url"] = ""
  577. video_url_dict["audio_url"] = ""
  578. video_url_dict["video_width"] = 0
  579. video_url_dict["video_height"] = 0
  580. return video_url_dict
  581. def get_comment_cnt(item_id):
  582. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  583. params = {
  584. "tab_index": "0",
  585. "count": "10",
  586. "offset": "10",
  587. "group_id": str(item_id),
  588. "item_id": str(item_id),
  589. "aid": "1768",
  590. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  591. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  592. "_signature": random_signature(),
  593. }
  594. headers = {
  595. "authority": "www.ixigua.com",
  596. "accept": "application/json, text/plain, */*",
  597. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  598. "cache-control": "no-cache",
  599. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  600. "pragma": "no-cache",
  601. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  602. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  603. "sec-ch-ua-mobile": "?0",
  604. "sec-ch-ua-platform": '"macOS"',
  605. "sec-fetch-dest": "empty",
  606. "sec-fetch-mode": "cors",
  607. "sec-fetch-site": "same-origin",
  608. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  609. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  610. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  611. }
  612. response = requests.get(
  613. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  614. )
  615. response.close()
  616. if (
  617. response.status_code != 200
  618. or "total_number" not in response.json()
  619. or response.json() == {}
  620. ):
  621. return 0
  622. return response.json().get("total_number", 0)
  623. class XiGuaAuthor:
  624. def __init__(self, platform, mode, rule_dict, env, user_list):
  625. self.platform = platform
  626. self.mode = mode
  627. self.rule_dict = rule_dict
  628. self.env = env
  629. self.user_list = user_list
  630. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  631. self.download_count = 0
  632. def get_author_list(self):
  633. # 每轮只抓取定量的数据,到达数量后自己退出
  634. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  635. for user_dict in self.user_list:
  636. # if self.download_count <= max_count:
  637. self.get_video_list(user_dict)
  638. # time.sleep(random.randint(1, 15))
  639. # else:
  640. # AliyunLogger.logging(
  641. # code="2000",
  642. # platform=self.platform,
  643. # mode=self.mode,
  644. # env=self.env,
  645. # message="本轮已经抓取足够数量的视频,已经自动退出",
  646. # )
  647. # return
  648. def get_video_list(self, user_dict):
  649. offset = 0
  650. signature = random_signature()
  651. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  652. while True:
  653. params = {
  654. "to_user_id": str(
  655. user_dict["link"].replace("https://www.ixigua.com/home/", "")
  656. ),
  657. "offset": str(offset),
  658. "limit": "30",
  659. "maxBehotTime": "0",
  660. "order": "new",
  661. "isHome": "0",
  662. # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  663. # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  664. "_signature": signature,
  665. }
  666. headers = {
  667. "referer": f'https://www.ixigua.com/home/{user_dict["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  668. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  669. }
  670. response = requests.get(
  671. url=url,
  672. headers=headers,
  673. params=params,
  674. proxies=tunnel_proxies(),
  675. timeout=5,
  676. )
  677. offset += 30
  678. if "data" not in response.text or response.status_code != 200:
  679. AliyunLogger.logging(
  680. code="2000",
  681. platform=self.platform,
  682. mode=self.mode,
  683. env=self.env,
  684. message=f"get_videoList:{response.text}\n",
  685. )
  686. return
  687. elif not response.json()["data"]["videoList"]:
  688. AliyunLogger.logging(
  689. code="2000",
  690. platform=self.platform,
  691. mode=self.mode,
  692. env=self.env,
  693. message=f"没有更多数据啦~\n",
  694. )
  695. return
  696. else:
  697. feeds = response.json()["data"]["videoList"]
  698. for video_obj in feeds:
  699. try:
  700. AliyunLogger.logging(
  701. code="1001",
  702. platform=self.platform,
  703. mode=self.mode,
  704. env=self.env,
  705. data=video_obj,
  706. message="扫描到一条视频",
  707. )
  708. date_flag = self.process_video_obj(video_obj, user_dict)
  709. if not date_flag:
  710. return
  711. except Exception as e:
  712. AliyunLogger.logging(
  713. code="3000",
  714. platform=self.platform,
  715. mode=self.mode,
  716. env=self.env,
  717. data=video_obj,
  718. message="抓取单条视频异常, 报错原因是: {}".format(e),
  719. )
  def process_video_obj(self, video_obj, user_dict):
      """
      Process one entry of an author's video list and send it to MQ if it qualifies.

      :param video_obj: one element of the ``videoList`` array; ``item_id`` and
          ``is_top`` are read from it.
      :param user_dict: the author being crawled; its ``uid`` is copied onto the
          outgoing record as ``user_id``.
      :return: ``False`` only when a non-pinned video is older than the configured
          ``period.max`` days. The caller stops scanning the author on a falsy
          result, so every other path returns ``True`` to let it move on to the
          next video.
      """
      trace_id = self.platform + str(uuid.uuid1())
      item_id = video_obj.get("item_id", "")
      if not item_id:
          AliyunLogger.logging(
              code="2005",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="无效视频",
              data=video_obj,
              trace_id=trace_id,
          )
          # A single malformed entry must not end the scan of the whole author.
          return True

      # Fetch the video details.
      video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
      if not video_dict:
          # get_video_info returns None when the details are unavailable;
          # skip this video instead of failing on a None subscript below.
          AliyunLogger.logging(
              code="2005",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="无效视频",
              data=video_obj,
              trace_id=trace_id,
          )
          return True

      # "out_user_id" must capture the platform-side id before "user_id" is
      # overwritten with the uid taken from user_dict.
      video_dict["out_user_id"] = video_dict["user_id"]
      video_dict["platform"] = self.platform
      video_dict["strategy"] = self.mode
      video_dict["out_video_id"] = video_dict["video_id"]
      video_dict["width"] = video_dict["video_width"]
      video_dict["height"] = video_dict["video_height"]
      video_dict["crawler_rule"] = json.dumps(self.rule_dict)
      video_dict["user_id"] = user_dict["uid"]
      video_dict["publish_time"] = video_dict["publish_time_str"]
      video_dict["strategy_type"] = self.mode
      now = int(time.time())
      video_dict["update_time_stamp"] = now

      max_period_days = int(self.rule_dict.get("period", {}).get("max", 1000))
      is_expired = now - video_dict["publish_time_stamp"] > 3600 * 24 * max_period_days
      # Only a non-pinned video past the publish-time window ends the scan;
      # an expired pinned video falls through to the normal checks.
      if is_expired and not video_obj["is_top"]:
          AliyunLogger.logging(
              code="2004",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              data=video_dict,
              message="发布时间超过{}天".format(max_period_days),
          )
          return False

      pipeline = PiaoQuanPipeline(
          platform=self.platform,
          mode=self.mode,
          rule_dict=self.rule_dict,
          env=self.env,
          item=video_dict,
          trace_id=trace_id,
      )
      title_flag = pipeline.title_flag()
      repeat_flag = pipeline.repeat_video()
      if not (title_flag and repeat_flag):
          # Rejected by the title or duplicate check: nothing to send, keep scanning.
          return True

      play_cnt = int(video_dict["play_cnt"])
      min_play_cnt = int(self.rule_dict.get("play_cnt", {}).get("min", 100000))
      if play_cnt < min_play_cnt:
          AliyunLogger.logging(
              code="2008",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="不满足特殊规则, 播放量",
              data=video_dict,
          )
          # Fallback rule: a like/play ratio of at least 4% also qualifies.
          # A video with zero plays has no ratio and is treated as not qualifying.
          like_ratio = float(video_dict["like_cnt"]) / play_cnt if play_cnt > 0 else 0.0
          if like_ratio < 0.04:
              AliyunLogger.logging(
                  code="2008",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="不满足特殊规则, 点赞量/播放量",
                  data=video_dict,
              )
              return True

      self.mq.send_msg(video_dict)
      self.download_count += 1
      AliyunLogger.logging(
          code="1002",
          platform=self.platform,
          mode=self.mode,
          env=self.env,
          data=video_dict,
          trace_id=trace_id,
          message="成功发送 MQ 至 ETL",
      )
      return True
  def get_video_info(self, item_id, trace_id):
      """
      Fetch the details of one video from the ``mixVideo/information`` endpoint.

      :param item_id: id of the video; sent as the ``mixId`` query parameter and
          stored on the result as ``gid``.
      :param trace_id: identifier attached to the failure log entry.
      :return: a dict describing the video, or ``None`` when the request does not
          return status 200, the body is not JSON, or the
          ``data.gidInformation.packerData.video`` object is missing or empty.
      """
      url = "https://www.ixigua.com/api/mixVideo/information?"
      headers = {
          "accept-encoding": "gzip, deflate",
          "accept-language": "zh-CN,zh-Hans;q=0.9",
          "user-agent": FakeUserAgent().random,
          "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
      }
      # NOTE(review): the tokens, signature and cookies below are hard-coded
      # values - presumably captured from a browser session; confirm they are
      # still accepted by the endpoint.
      params = {
          "mixId": str(item_id),
          "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
          "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
          "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
          "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
          "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
      }
      cookies = {
          "ixigua-a-s": "1",
          "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
          "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
          "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
          "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
          "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
          "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
          "__ac_nonce": "06304878000964fdad287",
          "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
          "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
          "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
          "_tea_utm_cache_1300": "undefined",
          "support_avif": "false",
          "support_webp": "false",
          "xiguavideopcwebid": "7134967546256016900",
          "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
      }
      response = requests.get(
          url=url,
          headers=headers,
          params=params,
          cookies=cookies,
          proxies=tunnel_proxies(),
          timeout=5,
      )

      # Decode the body once and walk down to the "video" object, tolerating
      # a non-JSON body and missing or null intermediate keys.
      video_info = {}
      if response.status_code == 200:
          try:
              payload = response.json()
          except ValueError:
              payload = None
          node = payload.get("data") if isinstance(payload, dict) else None
          for key in ("gidInformation", "packerData", "video"):
              node = node.get(key) if isinstance(node, dict) else None
          if isinstance(node, dict):
              video_info = node

      if not video_info:
          AliyunLogger.logging(
              code="2000",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="获取视频信息失败",
              trace_id=trace_id,
          )
          return None

      video_detail = get_video_url(video_info)
      user_info = video_info.get("user_info", {})
      publish_time_stamp = int(video_info.get("video_publish_time", 0))
      video_dict = {
          "video_title": video_info.get("title", ""),
          "video_id": video_info.get("videoResource", {}).get("vid", ""),
          "gid": str(item_id),
          "play_cnt": int(video_info.get("video_watch_count", 0)),
          "like_cnt": int(video_info.get("video_like_count", 0)),
          "comment_cnt": int(get_comment_cnt(item_id)),
          "share_cnt": 0,
          "favorite_cnt": 0,
          "duration": int(video_info.get("video_duration", 0)),
          "video_width": int(video_detail["video_width"]),
          "video_height": int(video_detail["video_height"]),
          "publish_time_stamp": publish_time_stamp,
          "publish_time_str": time.strftime(
              "%Y-%m-%d %H:%M:%S",
              time.localtime(publish_time_stamp),
          ),
          "user_name": user_info.get("name", ""),
          "user_id": str(user_info.get("user_id", "")),
          "avatar_url": str(user_info.get("avatar_url", "")),
          "cover_url": video_info.get("poster_url", ""),
          "audio_url": video_detail["audio_url"],
          "video_url": video_detail["video_url"],
          "session": f"xigua-search-{int(time.time())}",
      }
      return video_dict
  if __name__ == "__main__":
      # Sample authors for a manual run. Every entry shares the same uid, source,
      # avatar and mode; only the home-page link and the nickname differ.
      user_list = [
          {
              "uid": 6267140,
              "source": "xigua",
              "link": home_link,
              "nick_name": nick_name,
              "avatar_url": "",
              "mode": "author",
          }
          for home_link, nick_name in (
              ("https://www.ixigua.com/home/2779177225827568", "秋晴爱音乐"),
              ("https://www.ixigua.com/home/2885546124776780", "朗诵放歌的老山羊"),
              ("https://www.ixigua.com/home/5880938217", "天原声疗"),
          )
      ]
      # Example driver, kept disabled:
      # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
      # XGA = XiGuaAuthor(
      #     platform="xigua",
      #     mode="author",
      #     rule_dict=rule,
      #     env="prod",
      #     user_list=user_list
      # )
      # XGA.get_author_list()