xigua_search.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711
  1. """
  2. 西瓜视频搜索爬虫
  3. """
  4. import os
  5. import sys
  6. import json
  7. import time
  8. import uuid
  9. import random
  10. import base64
  11. import asyncio
  12. import aiohttp
  13. import urllib.parse
  14. import requests
  15. from lxml import etree
  16. sys.path.append(os.getcwd())
  17. from application.items import VideoItem
  18. from application.pipeline import PiaoQuanPipeline
  19. from application.common.messageQueue import MQ
  20. from application.common.proxies import tunnel_proxies
  21. from application.common.log import AliyunLogger
  22. def get_video_url(video_info):
  23. """
  24. 获取视频链接信息
  25. :param video_info:
  26. :return:
  27. """
  28. video_url_dict = {}
  29. # video_url
  30. if "videoResource" not in video_info:
  31. video_url_dict["video_url"] = ""
  32. video_url_dict["audio_url"] = ""
  33. video_url_dict["video_width"] = 0
  34. video_url_dict["video_height"] = 0
  35. elif "dash_120fps" in video_info["videoResource"]:
  36. if (
  37. "video_list" in video_info["videoResource"]["dash_120fps"]
  38. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  39. ):
  40. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  41. "video_4"
  42. ]["backup_url_1"]
  43. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  44. "video_4"
  45. ]["backup_url_1"]
  46. if len(video_url) % 3 == 1:
  47. video_url += "=="
  48. elif len(video_url) % 3 == 2:
  49. video_url += "="
  50. elif len(audio_url) % 3 == 1:
  51. audio_url += "=="
  52. elif len(audio_url) % 3 == 2:
  53. audio_url += "="
  54. video_url = base64.b64decode(video_url).decode("utf8")
  55. audio_url = base64.b64decode(audio_url).decode("utf8")
  56. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  57. "video_4"
  58. ]["vwidth"]
  59. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  60. "video_4"
  61. ]["vheight"]
  62. video_url_dict["video_url"] = video_url
  63. video_url_dict["audio_url"] = audio_url
  64. video_url_dict["video_width"] = video_width
  65. video_url_dict["video_height"] = video_height
  66. elif (
  67. "video_list" in video_info["videoResource"]["dash_120fps"]
  68. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  69. ):
  70. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  71. "video_3"
  72. ]["backup_url_1"]
  73. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  74. "video_3"
  75. ]["backup_url_1"]
  76. if len(video_url) % 3 == 1:
  77. video_url += "=="
  78. elif len(video_url) % 3 == 2:
  79. video_url += "="
  80. elif len(audio_url) % 3 == 1:
  81. audio_url += "=="
  82. elif len(audio_url) % 3 == 2:
  83. audio_url += "="
  84. video_url = base64.b64decode(video_url).decode("utf8")
  85. audio_url = base64.b64decode(audio_url).decode("utf8")
  86. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  87. "video_3"
  88. ]["vwidth"]
  89. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  90. "video_3"
  91. ]["vheight"]
  92. video_url_dict["video_url"] = video_url
  93. video_url_dict["audio_url"] = audio_url
  94. video_url_dict["video_width"] = video_width
  95. video_url_dict["video_height"] = video_height
  96. elif (
  97. "video_list" in video_info["videoResource"]["dash_120fps"]
  98. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  99. ):
  100. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  101. "video_2"
  102. ]["backup_url_1"]
  103. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  104. "video_2"
  105. ]["backup_url_1"]
  106. if len(video_url) % 3 == 1:
  107. video_url += "=="
  108. elif len(video_url) % 3 == 2:
  109. video_url += "="
  110. elif len(audio_url) % 3 == 1:
  111. audio_url += "=="
  112. elif len(audio_url) % 3 == 2:
  113. audio_url += "="
  114. video_url = base64.b64decode(video_url).decode("utf8")
  115. audio_url = base64.b64decode(audio_url).decode("utf8")
  116. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  117. "video_2"
  118. ]["vwidth"]
  119. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  120. "video_2"
  121. ]["vheight"]
  122. video_url_dict["video_url"] = video_url
  123. video_url_dict["audio_url"] = audio_url
  124. video_url_dict["video_width"] = video_width
  125. video_url_dict["video_height"] = video_height
  126. elif (
  127. "video_list" in video_info["videoResource"]["dash_120fps"]
  128. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  129. ):
  130. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  131. "video_1"
  132. ]["backup_url_1"]
  133. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  134. "video_1"
  135. ]["backup_url_1"]
  136. if len(video_url) % 3 == 1:
  137. video_url += "=="
  138. elif len(video_url) % 3 == 2:
  139. video_url += "="
  140. elif len(audio_url) % 3 == 1:
  141. audio_url += "=="
  142. elif len(audio_url) % 3 == 2:
  143. audio_url += "="
  144. video_url = base64.b64decode(video_url).decode("utf8")
  145. audio_url = base64.b64decode(audio_url).decode("utf8")
  146. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  147. "video_1"
  148. ]["vwidth"]
  149. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  150. "video_1"
  151. ]["vheight"]
  152. video_url_dict["video_url"] = video_url
  153. video_url_dict["audio_url"] = audio_url
  154. video_url_dict["video_width"] = video_width
  155. video_url_dict["video_height"] = video_height
  156. elif (
  157. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  158. and "dynamic_video_list"
  159. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  160. and "dynamic_audio_list"
  161. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  162. and len(
  163. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  164. "dynamic_video_list"
  165. ]
  166. )
  167. != 0
  168. and len(
  169. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  170. "dynamic_audio_list"
  171. ]
  172. )
  173. != 0
  174. ):
  175. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  176. "dynamic_video_list"
  177. ][-1]["backup_url_1"]
  178. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  179. "dynamic_audio_list"
  180. ][-1]["backup_url_1"]
  181. if len(video_url) % 3 == 1:
  182. video_url += "=="
  183. elif len(video_url) % 3 == 2:
  184. video_url += "="
  185. elif len(audio_url) % 3 == 1:
  186. audio_url += "=="
  187. elif len(audio_url) % 3 == 2:
  188. audio_url += "="
  189. video_url = base64.b64decode(video_url).decode("utf8")
  190. audio_url = base64.b64decode(audio_url).decode("utf8")
  191. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  192. "dynamic_video_list"
  193. ][-1]["vwidth"]
  194. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  195. "dynamic_video_list"
  196. ][-1]["vheight"]
  197. video_url_dict["video_url"] = video_url
  198. video_url_dict["audio_url"] = audio_url
  199. video_url_dict["video_width"] = video_width
  200. video_url_dict["video_height"] = video_height
  201. else:
  202. video_url_dict["video_url"] = ""
  203. video_url_dict["audio_url"] = ""
  204. video_url_dict["video_width"] = 0
  205. video_url_dict["video_height"] = 0
  206. elif "dash" in video_info["videoResource"]:
  207. if (
  208. "video_list" in video_info["videoResource"]["dash"]
  209. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  210. ):
  211. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  212. "backup_url_1"
  213. ]
  214. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  215. "backup_url_1"
  216. ]
  217. if len(video_url) % 3 == 1:
  218. video_url += "=="
  219. elif len(video_url) % 3 == 2:
  220. video_url += "="
  221. elif len(audio_url) % 3 == 1:
  222. audio_url += "=="
  223. elif len(audio_url) % 3 == 2:
  224. audio_url += "="
  225. video_url = base64.b64decode(video_url).decode("utf8")
  226. audio_url = base64.b64decode(audio_url).decode("utf8")
  227. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  228. "vwidth"
  229. ]
  230. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  231. "vheight"
  232. ]
  233. video_url_dict["video_url"] = video_url
  234. video_url_dict["audio_url"] = audio_url
  235. video_url_dict["video_width"] = video_width
  236. video_url_dict["video_height"] = video_height
  237. elif (
  238. "video_list" in video_info["videoResource"]["dash"]
  239. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  240. ):
  241. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  242. "backup_url_1"
  243. ]
  244. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  245. "backup_url_1"
  246. ]
  247. if len(video_url) % 3 == 1:
  248. video_url += "=="
  249. elif len(video_url) % 3 == 2:
  250. video_url += "="
  251. elif len(audio_url) % 3 == 1:
  252. audio_url += "=="
  253. elif len(audio_url) % 3 == 2:
  254. audio_url += "="
  255. video_url = base64.b64decode(video_url).decode("utf8")
  256. audio_url = base64.b64decode(audio_url).decode("utf8")
  257. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  258. "vwidth"
  259. ]
  260. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  261. "vheight"
  262. ]
  263. video_url_dict["video_url"] = video_url
  264. video_url_dict["audio_url"] = audio_url
  265. video_url_dict["video_width"] = video_width
  266. video_url_dict["video_height"] = video_height
  267. elif (
  268. "video_list" in video_info["videoResource"]["dash"]
  269. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  270. ):
  271. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  272. "backup_url_1"
  273. ]
  274. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  275. "backup_url_1"
  276. ]
  277. if len(video_url) % 3 == 1:
  278. video_url += "=="
  279. elif len(video_url) % 3 == 2:
  280. video_url += "="
  281. elif len(audio_url) % 3 == 1:
  282. audio_url += "=="
  283. elif len(audio_url) % 3 == 2:
  284. audio_url += "="
  285. video_url = base64.b64decode(video_url).decode("utf8")
  286. audio_url = base64.b64decode(audio_url).decode("utf8")
  287. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  288. "vwidth"
  289. ]
  290. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  291. "vheight"
  292. ]
  293. video_url_dict["video_url"] = video_url
  294. video_url_dict["audio_url"] = audio_url
  295. video_url_dict["video_width"] = video_width
  296. video_url_dict["video_height"] = video_height
  297. elif (
  298. "video_list" in video_info["videoResource"]["dash"]
  299. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  300. ):
  301. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  302. "backup_url_1"
  303. ]
  304. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  305. "backup_url_1"
  306. ]
  307. if len(video_url) % 3 == 1:
  308. video_url += "=="
  309. elif len(video_url) % 3 == 2:
  310. video_url += "="
  311. elif len(audio_url) % 3 == 1:
  312. audio_url += "=="
  313. elif len(audio_url) % 3 == 2:
  314. audio_url += "="
  315. video_url = base64.b64decode(video_url).decode("utf8")
  316. audio_url = base64.b64decode(audio_url).decode("utf8")
  317. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  318. "vwidth"
  319. ]
  320. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  321. "vheight"
  322. ]
  323. video_url_dict["video_url"] = video_url
  324. video_url_dict["audio_url"] = audio_url
  325. video_url_dict["video_width"] = video_width
  326. video_url_dict["video_height"] = video_height
  327. elif (
  328. "dynamic_video" in video_info["videoResource"]["dash"]
  329. and "dynamic_video_list"
  330. in video_info["videoResource"]["dash"]["dynamic_video"]
  331. and "dynamic_audio_list"
  332. in video_info["videoResource"]["dash"]["dynamic_video"]
  333. and len(
  334. video_info["videoResource"]["dash"]["dynamic_video"][
  335. "dynamic_video_list"
  336. ]
  337. )
  338. != 0
  339. and len(
  340. video_info["videoResource"]["dash"]["dynamic_video"][
  341. "dynamic_audio_list"
  342. ]
  343. )
  344. != 0
  345. ):
  346. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  347. "dynamic_video_list"
  348. ][-1]["backup_url_1"]
  349. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  350. "dynamic_audio_list"
  351. ][-1]["backup_url_1"]
  352. if len(video_url) % 3 == 1:
  353. video_url += "=="
  354. elif len(video_url) % 3 == 2:
  355. video_url += "="
  356. elif len(audio_url) % 3 == 1:
  357. audio_url += "=="
  358. elif len(audio_url) % 3 == 2:
  359. audio_url += "="
  360. video_url = base64.b64decode(video_url).decode("utf8")
  361. audio_url = base64.b64decode(audio_url).decode("utf8")
  362. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  363. "dynamic_video_list"
  364. ][-1]["vwidth"]
  365. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  366. "dynamic_video_list"
  367. ][-1]["vheight"]
  368. video_url_dict["video_url"] = video_url
  369. video_url_dict["audio_url"] = audio_url
  370. video_url_dict["video_width"] = video_width
  371. video_url_dict["video_height"] = video_height
  372. else:
  373. video_url_dict["video_url"] = ""
  374. video_url_dict["audio_url"] = ""
  375. video_url_dict["video_width"] = 0
  376. video_url_dict["video_height"] = 0
  377. elif "normal" in video_info["videoResource"]:
  378. if (
  379. "video_list" in video_info["videoResource"]["normal"]
  380. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  381. ):
  382. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  383. "backup_url_1"
  384. ]
  385. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  386. "backup_url_1"
  387. ]
  388. if len(video_url) % 3 == 1:
  389. video_url += "=="
  390. elif len(video_url) % 3 == 2:
  391. video_url += "="
  392. elif len(audio_url) % 3 == 1:
  393. audio_url += "=="
  394. elif len(audio_url) % 3 == 2:
  395. audio_url += "="
  396. video_url = base64.b64decode(video_url).decode("utf8")
  397. audio_url = base64.b64decode(audio_url).decode("utf8")
  398. video_width = video_info["videoResource"]["normal"]["video_list"][
  399. "video_4"
  400. ]["vwidth"]
  401. video_height = video_info["videoResource"]["normal"]["video_list"][
  402. "video_4"
  403. ]["vheight"]
  404. video_url_dict["video_url"] = video_url
  405. video_url_dict["audio_url"] = audio_url
  406. video_url_dict["video_width"] = video_width
  407. video_url_dict["video_height"] = video_height
  408. elif (
  409. "video_list" in video_info["videoResource"]["normal"]
  410. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  411. ):
  412. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  413. "backup_url_1"
  414. ]
  415. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  416. "backup_url_1"
  417. ]
  418. if len(video_url) % 3 == 1:
  419. video_url += "=="
  420. elif len(video_url) % 3 == 2:
  421. video_url += "="
  422. elif len(audio_url) % 3 == 1:
  423. audio_url += "=="
  424. elif len(audio_url) % 3 == 2:
  425. audio_url += "="
  426. video_url = base64.b64decode(video_url).decode("utf8")
  427. audio_url = base64.b64decode(audio_url).decode("utf8")
  428. video_width = video_info["videoResource"]["normal"]["video_list"][
  429. "video_3"
  430. ]["vwidth"]
  431. video_height = video_info["videoResource"]["normal"]["video_list"][
  432. "video_3"
  433. ]["vheight"]
  434. video_url_dict["video_url"] = video_url
  435. video_url_dict["audio_url"] = audio_url
  436. video_url_dict["video_width"] = video_width
  437. video_url_dict["video_height"] = video_height
  438. elif (
  439. "video_list" in video_info["videoResource"]["normal"]
  440. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  441. ):
  442. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  443. "backup_url_1"
  444. ]
  445. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  446. "backup_url_1"
  447. ]
  448. if len(video_url) % 3 == 1:
  449. video_url += "=="
  450. elif len(video_url) % 3 == 2:
  451. video_url += "="
  452. elif len(audio_url) % 3 == 1:
  453. audio_url += "=="
  454. elif len(audio_url) % 3 == 2:
  455. audio_url += "="
  456. video_url = base64.b64decode(video_url).decode("utf8")
  457. audio_url = base64.b64decode(audio_url).decode("utf8")
  458. video_width = video_info["videoResource"]["normal"]["video_list"][
  459. "video_2"
  460. ]["vwidth"]
  461. video_height = video_info["videoResource"]["normal"]["video_list"][
  462. "video_2"
  463. ]["vheight"]
  464. video_url_dict["video_url"] = video_url
  465. video_url_dict["audio_url"] = audio_url
  466. video_url_dict["video_width"] = video_width
  467. video_url_dict["video_height"] = video_height
  468. elif (
  469. "video_list" in video_info["videoResource"]["normal"]
  470. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  471. ):
  472. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  473. "backup_url_1"
  474. ]
  475. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  476. "backup_url_1"
  477. ]
  478. if len(video_url) % 3 == 1:
  479. video_url += "=="
  480. elif len(video_url) % 3 == 2:
  481. video_url += "="
  482. elif len(audio_url) % 3 == 1:
  483. audio_url += "=="
  484. elif len(audio_url) % 3 == 2:
  485. audio_url += "="
  486. video_url = base64.b64decode(video_url).decode("utf8")
  487. audio_url = base64.b64decode(audio_url).decode("utf8")
  488. video_width = video_info["videoResource"]["normal"]["video_list"][
  489. "video_1"
  490. ]["vwidth"]
  491. video_height = video_info["videoResource"]["normal"]["video_list"][
  492. "video_1"
  493. ]["vheight"]
  494. video_url_dict["video_url"] = video_url
  495. video_url_dict["audio_url"] = audio_url
  496. video_url_dict["video_width"] = video_width
  497. video_url_dict["video_height"] = video_height
  498. elif (
  499. "dynamic_video" in video_info["videoResource"]["normal"]
  500. and "dynamic_video_list"
  501. in video_info["videoResource"]["normal"]["dynamic_video"]
  502. and "dynamic_audio_list"
  503. in video_info["videoResource"]["normal"]["dynamic_video"]
  504. and len(
  505. video_info["videoResource"]["normal"]["dynamic_video"][
  506. "dynamic_video_list"
  507. ]
  508. )
  509. != 0
  510. and len(
  511. video_info["videoResource"]["normal"]["dynamic_video"][
  512. "dynamic_audio_list"
  513. ]
  514. )
  515. != 0
  516. ):
  517. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  518. "dynamic_video_list"
  519. ][-1]["backup_url_1"]
  520. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  521. "dynamic_audio_list"
  522. ][-1]["backup_url_1"]
  523. if len(video_url) % 3 == 1:
  524. video_url += "=="
  525. elif len(video_url) % 3 == 2:
  526. video_url += "="
  527. elif len(audio_url) % 3 == 1:
  528. audio_url += "=="
  529. elif len(audio_url) % 3 == 2:
  530. audio_url += "="
  531. video_url = base64.b64decode(video_url).decode("utf8")
  532. audio_url = base64.b64decode(audio_url).decode("utf8")
  533. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  534. "dynamic_video_list"
  535. ][-1]["vwidth"]
  536. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  537. "dynamic_video_list"
  538. ][-1]["vheight"]
  539. video_url_dict["video_url"] = video_url
  540. video_url_dict["audio_url"] = audio_url
  541. video_url_dict["video_width"] = video_width
  542. video_url_dict["video_height"] = video_height
  543. else:
  544. video_url_dict["video_url"] = ""
  545. video_url_dict["audio_url"] = ""
  546. video_url_dict["video_width"] = 0
  547. video_url_dict["video_height"] = 0
  548. else:
  549. video_url_dict["video_url"] = ""
  550. video_url_dict["audio_url"] = ""
  551. video_url_dict["video_width"] = 0
  552. video_url_dict["video_height"] = 0
  553. return video_url_dict
  554. class XiGuaSearch(object):
  555. """
  556. XiGuaSearch
  557. """
  558. def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
  559. self.platform = platform
  560. self.mode = mode
  561. self.rule_dict = rule_dict
  562. self.user_list = user_list
  563. self.env = env
  564. self.download_cnt = 0
  565. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  566. self.expire_flag = False
  567. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  568. async def search(self, keyword):
  569. """搜索"""
  570. keyword = urllib.parse.quote(keyword)
  571. base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
  572. keyword
  573. )
  574. headers = {
  575. "authority": "www.ixigua.com",
  576. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  577. "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
  578. "cache-control": "max-age=0",
  579. "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
  580. "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
  581. "sec-ch-ua-mobile": "?0",
  582. "sec-ch-ua-platform": '"macOS"',
  583. "sec-fetch-dest": "document",
  584. "sec-fetch-mode": "navigate",
  585. "sec-fetch-site": "none",
  586. "sec-fetch-user": "?1",
  587. "upgrade-insecure-requests": "1",
  588. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  589. }
  590. basic_response = requests.get(url=base_url, headers=headers)
  591. html = etree.HTML(basic_response.text)
  592. result = html.xpath(
  593. '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
  594. )
  595. print(result)
  596. async with aiohttp.ClientSession() as session:
  597. tasks = [self.get_video_info(session, page_id[1:-2]) for page_id in result]
  598. await asyncio.gather(*tasks)
  599. async def get_video_info(self, session, page_id):
  600. """
  601. :param session:
  602. :param page_id: 视频主页 id
  603. :return:
  604. """
  605. url = "https://www.ixigua.com/api/mixVideo/information?"
  606. headers = {
  607. "accept-encoding": "gzip, deflate",
  608. "accept-language": "zh-CN,zh-Hans;q=0.9",
  609. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  610. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  611. }
  612. params = {
  613. "mixId": str(page_id),
  614. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
  615. "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  616. "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
  617. "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
  618. "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
  619. }
  620. cookies = {
  621. "ixigua-a-s": "1",
  622. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
  623. "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  624. "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
  625. "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
  626. "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
  627. "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
  628. "__ac_nonce": "06304878000964fdad287",
  629. "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
  630. "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
  631. "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
  632. "_tea_utm_cache_1300": "undefined",
  633. "support_avif": "false",
  634. "support_webp": "false",
  635. "xiguavideopcwebid": "7134967546256016900",
  636. "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
  637. }
  638. async with session.get(
  639. url, headers=headers, params=params, cookies=cookies
  640. ) as response:
  641. video_info = await response.json()
  642. video_info = (
  643. video_info["data"]
  644. .get("gidInformation", {})
  645. .get("packerData", {})
  646. .get("video", {})
  647. )
  648. # print(video_info)
  649. item = VideoItem()
  650. item.add_video_info("video_title", video_info.get("title", ""))
  651. item.add_video_info(
  652. "video_id", video_info.get("videoResource", {}).get("vid", "")
  653. )
  654. item.add_video_info("play_cnt", int(video_info.get("video_watch_count", 0)))
  655. item.add_video_info("like_cnt", int(video_info.get("video_like_count", 0)))
  656. item.add_video_info("duration", int(video_info.get("video_duration", 0)))
  657. item.add_video_info(
  658. "publish_time_stamp", int(video_info.get("video_publish_time", 0))
  659. )
  660. item.add_video_info(
  661. "publish_time_str",
  662. time.strftime(
  663. "%Y-%m-%d %H:%M:%S",
  664. time.localtime(int(video_info.get("video_publish_time", 0))),
  665. ),
  666. )
  667. item.add_video_info(
  668. "user_name", video_info.get("user_info", {}).get("name", "")
  669. )
  670. item.add_video_info(
  671. "user_id", str(video_info.get("user_info", {}).get("user_id", ""))
  672. )
  673. item.add_video_info(
  674. "avatar_url", str(video_info.get("user_info", {}).get("avatar_url", ""))
  675. )
  676. item.add_video_info("cover_url", video_info.get("poster_url", ""))
  677. item.add_video_info("audio_url", get_video_url(video_info)["audio_url"])
  678. item.add_video_info("video_url", get_video_url(video_info)["video_url"])
  679. item.add_video_info("session", "xigua-search-{}".format(int(time.time())))
  680. item.add_video_info("out_video_id", video_info.get("videoResource", {}).get("vid", ""))
  681. item.add_video_info("platform", self.platform)
  682. item.add_video_info("strategy", self.mode)
  683. # item.add_video_info("")
  684. mq_obj = item.produce_item()
  685. # print(mq_obj)
  686. print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
  687. if __name__ == "__main__":
  688. S = XiGuaSearch(platform=1, mode=2, rule_dict=3, user_list=1)
  689. loop = asyncio.get_event_loop()
  690. loop.run_until_complete(S.search("春节"))
  691. # await