xigua_author_test.py 41 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968
  1. import json
  2. import re
  3. import os
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from fake_useragent import FakeUserAgent
  12. sys.path.append(os.getcwd())
  13. class PiaoQuanPipelineTest:
  14. def __init__(self, platform, mode, rule_dict, env, item, trace_id):
  15. self.platform = platform
  16. self.mode = mode
  17. self.item = item
  18. self.rule_dict = rule_dict
  19. self.env = env
  20. self.trace_id = trace_id
  21. # 视频的发布时间限制, 属于是规则过滤
  22. def publish_time_flag(self):
  23. # 判断发布时间
  24. publish_time_stamp = self.item["publish_time_stamp"]
  25. update_time_stamp = self.item["update_time_stamp"]
  26. if self.platform == "gongzhonghao":
  27. if (
  28. int(time.time()) - publish_time_stamp
  29. > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
  30. ) and (
  31. int(time.time()) - update_time_stamp
  32. > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
  33. ):
  34. message = "发布时间超过{}天".format(
  35. int(self.rule_dict.get("period", {}).get("max", 1000))
  36. )
  37. print(message)
  38. return False
  39. else:
  40. if (
  41. int(time.time()) - publish_time_stamp
  42. > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
  43. ):
  44. message = "发布时间超过{}天".format(
  45. int(self.rule_dict.get("period", {}).get("max", 1000))
  46. )
  47. print(message)
  48. return False
  49. return True
  50. # 视频标题是否满足需求
  51. def title_flag(self):
  52. title = self.item["video_title"]
  53. cleaned_title = re.sub(r"[^\w]", " ", title)
  54. # 敏感词
  55. # 获取敏感词列表
  56. sensitive_words = []
  57. if any(word in cleaned_title for word in sensitive_words):
  58. message = "标题中包含敏感词"
  59. print(message)
  60. return False
  61. return True
  62. # 视频基础下载规则
  63. def download_rule_flag(self):
  64. for key in self.item:
  65. if self.rule_dict.get(key):
  66. max_value = (
  67. int(self.rule_dict[key]["max"])
  68. if int(self.rule_dict[key]["max"]) > 0
  69. else 999999999999999
  70. )
  71. if key == "peroid": # peroid是抓取周期天数
  72. continue
  73. else:
  74. flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
  75. if not flag:
  76. message = "{}: {} <= {} <= {}, {}".format(
  77. key,
  78. self.rule_dict[key]["min"],
  79. self.item[key],
  80. max_value,
  81. flag,
  82. )
  83. print(message)
  84. return flag
  85. else:
  86. continue
  87. return True
  88. # 按照某个具体平台来去重
  89. # def repeat_video(self):
  90. # # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
  91. # out_id = self.item["out_video_id"]
  92. # sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
  93. # repeat_video = MysqlHelper.get_values(
  94. # log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
  95. # )
  96. # if repeat_video:
  97. # message = "重复的视频"
  98. # return False
  99. # return True
  100. def process_item(self):
  101. if not self.publish_time_flag():
  102. # 记录相关日志
  103. return False
  104. if not self.title_flag():
  105. # 记录相关日志
  106. return False
  107. # if not self.repeat_video():
  108. # # 记录相关日志
  109. # return False
  110. if not self.download_rule_flag():
  111. # 记录相关日志
  112. return False
  113. return True
  114. def tunnel_proxies():
  115. # 隧道域名:端口号
  116. tunnel = "q796.kdltps.com:15818"
  117. # 用户名密码方式
  118. username = "t17772369458618"
  119. password = "5zqcjkmy"
  120. tunnel_proxies = {
  121. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  122. % {"user": username, "pwd": password, "proxy": tunnel},
  123. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  124. % {"user": username, "pwd": password, "proxy": tunnel},
  125. }
  126. return tunnel_proxies
  127. def random_signature():
  128. src_digits = string.digits # string_数字
  129. src_uppercase = string.ascii_uppercase # string_大写字母
  130. src_lowercase = string.ascii_lowercase # string_小写字母
  131. digits_num = random.randint(1, 6)
  132. uppercase_num = random.randint(1, 26 - digits_num - 1)
  133. lowercase_num = 26 - (digits_num + uppercase_num)
  134. password = (
  135. random.sample(src_digits, digits_num)
  136. + random.sample(src_uppercase, uppercase_num)
  137. + random.sample(src_lowercase, lowercase_num)
  138. )
  139. random.shuffle(password)
  140. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  141. new_password_start = new_password[0:18]
  142. new_password_end = new_password[-7:]
  143. if new_password[18] == "8":
  144. new_password = new_password_start + "w" + new_password_end
  145. elif new_password[18] == "9":
  146. new_password = new_password_start + "x" + new_password_end
  147. elif new_password[18] == "-":
  148. new_password = new_password_start + "y" + new_password_end
  149. elif new_password[18] == ".":
  150. new_password = new_password_start + "z" + new_password_end
  151. else:
  152. new_password = new_password_start + "y" + new_password_end
  153. return new_password
  154. def get_video_url(video_info):
  155. video_url_dict = {}
  156. # video_url
  157. if "videoResource" not in video_info:
  158. video_url_dict["video_url"] = ""
  159. video_url_dict["audio_url"] = ""
  160. video_url_dict["video_width"] = 0
  161. video_url_dict["video_height"] = 0
  162. elif "dash_120fps" in video_info["videoResource"]:
  163. if (
  164. "video_list" in video_info["videoResource"]["dash_120fps"]
  165. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  166. ):
  167. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  168. "video_4"
  169. ]["backup_url_1"]
  170. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  171. "video_4"
  172. ]["backup_url_1"]
  173. if len(video_url) % 3 == 1:
  174. video_url += "=="
  175. elif len(video_url) % 3 == 2:
  176. video_url += "="
  177. elif len(audio_url) % 3 == 1:
  178. audio_url += "=="
  179. elif len(audio_url) % 3 == 2:
  180. audio_url += "="
  181. video_url = base64.b64decode(video_url).decode("utf8")
  182. audio_url = base64.b64decode(audio_url).decode("utf8")
  183. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  184. "video_4"
  185. ]["vwidth"]
  186. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  187. "video_4"
  188. ]["vheight"]
  189. video_url_dict["video_url"] = video_url
  190. video_url_dict["audio_url"] = audio_url
  191. video_url_dict["video_width"] = video_width
  192. video_url_dict["video_height"] = video_height
  193. elif (
  194. "video_list" in video_info["videoResource"]["dash_120fps"]
  195. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  196. ):
  197. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  198. "video_3"
  199. ]["backup_url_1"]
  200. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  201. "video_3"
  202. ]["backup_url_1"]
  203. if len(video_url) % 3 == 1:
  204. video_url += "=="
  205. elif len(video_url) % 3 == 2:
  206. video_url += "="
  207. elif len(audio_url) % 3 == 1:
  208. audio_url += "=="
  209. elif len(audio_url) % 3 == 2:
  210. audio_url += "="
  211. video_url = base64.b64decode(video_url).decode("utf8")
  212. audio_url = base64.b64decode(audio_url).decode("utf8")
  213. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  214. "video_3"
  215. ]["vwidth"]
  216. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  217. "video_3"
  218. ]["vheight"]
  219. video_url_dict["video_url"] = video_url
  220. video_url_dict["audio_url"] = audio_url
  221. video_url_dict["video_width"] = video_width
  222. video_url_dict["video_height"] = video_height
  223. elif (
  224. "video_list" in video_info["videoResource"]["dash_120fps"]
  225. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  226. ):
  227. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  228. "video_2"
  229. ]["backup_url_1"]
  230. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  231. "video_2"
  232. ]["backup_url_1"]
  233. if len(video_url) % 3 == 1:
  234. video_url += "=="
  235. elif len(video_url) % 3 == 2:
  236. video_url += "="
  237. elif len(audio_url) % 3 == 1:
  238. audio_url += "=="
  239. elif len(audio_url) % 3 == 2:
  240. audio_url += "="
  241. video_url = base64.b64decode(video_url).decode("utf8")
  242. audio_url = base64.b64decode(audio_url).decode("utf8")
  243. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  244. "video_2"
  245. ]["vwidth"]
  246. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  247. "video_2"
  248. ]["vheight"]
  249. video_url_dict["video_url"] = video_url
  250. video_url_dict["audio_url"] = audio_url
  251. video_url_dict["video_width"] = video_width
  252. video_url_dict["video_height"] = video_height
  253. elif (
  254. "video_list" in video_info["videoResource"]["dash_120fps"]
  255. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  256. ):
  257. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  258. "video_1"
  259. ]["backup_url_1"]
  260. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  261. "video_1"
  262. ]["backup_url_1"]
  263. if len(video_url) % 3 == 1:
  264. video_url += "=="
  265. elif len(video_url) % 3 == 2:
  266. video_url += "="
  267. elif len(audio_url) % 3 == 1:
  268. audio_url += "=="
  269. elif len(audio_url) % 3 == 2:
  270. audio_url += "="
  271. video_url = base64.b64decode(video_url).decode("utf8")
  272. audio_url = base64.b64decode(audio_url).decode("utf8")
  273. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  274. "video_1"
  275. ]["vwidth"]
  276. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  277. "video_1"
  278. ]["vheight"]
  279. video_url_dict["video_url"] = video_url
  280. video_url_dict["audio_url"] = audio_url
  281. video_url_dict["video_width"] = video_width
  282. video_url_dict["video_height"] = video_height
  283. elif (
  284. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  285. and "dynamic_video_list"
  286. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  287. and "dynamic_audio_list"
  288. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  289. and len(
  290. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  291. "dynamic_video_list"
  292. ]
  293. )
  294. != 0
  295. and len(
  296. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  297. "dynamic_audio_list"
  298. ]
  299. )
  300. != 0
  301. ):
  302. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  303. "dynamic_video_list"
  304. ][-1]["backup_url_1"]
  305. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  306. "dynamic_audio_list"
  307. ][-1]["backup_url_1"]
  308. if len(video_url) % 3 == 1:
  309. video_url += "=="
  310. elif len(video_url) % 3 == 2:
  311. video_url += "="
  312. elif len(audio_url) % 3 == 1:
  313. audio_url += "=="
  314. elif len(audio_url) % 3 == 2:
  315. audio_url += "="
  316. video_url = base64.b64decode(video_url).decode("utf8")
  317. audio_url = base64.b64decode(audio_url).decode("utf8")
  318. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  319. "dynamic_video_list"
  320. ][-1]["vwidth"]
  321. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  322. "dynamic_video_list"
  323. ][-1]["vheight"]
  324. video_url_dict["video_url"] = video_url
  325. video_url_dict["audio_url"] = audio_url
  326. video_url_dict["video_width"] = video_width
  327. video_url_dict["video_height"] = video_height
  328. else:
  329. video_url_dict["video_url"] = ""
  330. video_url_dict["audio_url"] = ""
  331. video_url_dict["video_width"] = 0
  332. video_url_dict["video_height"] = 0
  333. elif "dash" in video_info["videoResource"]:
  334. if (
  335. "video_list" in video_info["videoResource"]["dash"]
  336. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  337. ):
  338. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  339. "backup_url_1"
  340. ]
  341. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  342. "backup_url_1"
  343. ]
  344. if len(video_url) % 3 == 1:
  345. video_url += "=="
  346. elif len(video_url) % 3 == 2:
  347. video_url += "="
  348. elif len(audio_url) % 3 == 1:
  349. audio_url += "=="
  350. elif len(audio_url) % 3 == 2:
  351. audio_url += "="
  352. video_url = base64.b64decode(video_url).decode("utf8")
  353. audio_url = base64.b64decode(audio_url).decode("utf8")
  354. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  355. "vwidth"
  356. ]
  357. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  358. "vheight"
  359. ]
  360. video_url_dict["video_url"] = video_url
  361. video_url_dict["audio_url"] = audio_url
  362. video_url_dict["video_width"] = video_width
  363. video_url_dict["video_height"] = video_height
  364. elif (
  365. "video_list" in video_info["videoResource"]["dash"]
  366. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  367. ):
  368. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  369. "backup_url_1"
  370. ]
  371. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  372. "backup_url_1"
  373. ]
  374. if len(video_url) % 3 == 1:
  375. video_url += "=="
  376. elif len(video_url) % 3 == 2:
  377. video_url += "="
  378. elif len(audio_url) % 3 == 1:
  379. audio_url += "=="
  380. elif len(audio_url) % 3 == 2:
  381. audio_url += "="
  382. video_url = base64.b64decode(video_url).decode("utf8")
  383. audio_url = base64.b64decode(audio_url).decode("utf8")
  384. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  385. "vwidth"
  386. ]
  387. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  388. "vheight"
  389. ]
  390. video_url_dict["video_url"] = video_url
  391. video_url_dict["audio_url"] = audio_url
  392. video_url_dict["video_width"] = video_width
  393. video_url_dict["video_height"] = video_height
  394. elif (
  395. "video_list" in video_info["videoResource"]["dash"]
  396. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  397. ):
  398. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  399. "backup_url_1"
  400. ]
  401. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  402. "backup_url_1"
  403. ]
  404. if len(video_url) % 3 == 1:
  405. video_url += "=="
  406. elif len(video_url) % 3 == 2:
  407. video_url += "="
  408. elif len(audio_url) % 3 == 1:
  409. audio_url += "=="
  410. elif len(audio_url) % 3 == 2:
  411. audio_url += "="
  412. video_url = base64.b64decode(video_url).decode("utf8")
  413. audio_url = base64.b64decode(audio_url).decode("utf8")
  414. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  415. "vwidth"
  416. ]
  417. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  418. "vheight"
  419. ]
  420. video_url_dict["video_url"] = video_url
  421. video_url_dict["audio_url"] = audio_url
  422. video_url_dict["video_width"] = video_width
  423. video_url_dict["video_height"] = video_height
  424. elif (
  425. "video_list" in video_info["videoResource"]["dash"]
  426. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  427. ):
  428. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  429. "backup_url_1"
  430. ]
  431. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  432. "backup_url_1"
  433. ]
  434. if len(video_url) % 3 == 1:
  435. video_url += "=="
  436. elif len(video_url) % 3 == 2:
  437. video_url += "="
  438. elif len(audio_url) % 3 == 1:
  439. audio_url += "=="
  440. elif len(audio_url) % 3 == 2:
  441. audio_url += "="
  442. video_url = base64.b64decode(video_url).decode("utf8")
  443. audio_url = base64.b64decode(audio_url).decode("utf8")
  444. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  445. "vwidth"
  446. ]
  447. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  448. "vheight"
  449. ]
  450. video_url_dict["video_url"] = video_url
  451. video_url_dict["audio_url"] = audio_url
  452. video_url_dict["video_width"] = video_width
  453. video_url_dict["video_height"] = video_height
  454. elif (
  455. "dynamic_video" in video_info["videoResource"]["dash"]
  456. and "dynamic_video_list"
  457. in video_info["videoResource"]["dash"]["dynamic_video"]
  458. and "dynamic_audio_list"
  459. in video_info["videoResource"]["dash"]["dynamic_video"]
  460. and len(
  461. video_info["videoResource"]["dash"]["dynamic_video"][
  462. "dynamic_video_list"
  463. ]
  464. )
  465. != 0
  466. and len(
  467. video_info["videoResource"]["dash"]["dynamic_video"][
  468. "dynamic_audio_list"
  469. ]
  470. )
  471. != 0
  472. ):
  473. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  474. "dynamic_video_list"
  475. ][-1]["backup_url_1"]
  476. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  477. "dynamic_audio_list"
  478. ][-1]["backup_url_1"]
  479. if len(video_url) % 3 == 1:
  480. video_url += "=="
  481. elif len(video_url) % 3 == 2:
  482. video_url += "="
  483. elif len(audio_url) % 3 == 1:
  484. audio_url += "=="
  485. elif len(audio_url) % 3 == 2:
  486. audio_url += "="
  487. video_url = base64.b64decode(video_url).decode("utf8")
  488. audio_url = base64.b64decode(audio_url).decode("utf8")
  489. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  490. "dynamic_video_list"
  491. ][-1]["vwidth"]
  492. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  493. "dynamic_video_list"
  494. ][-1]["vheight"]
  495. video_url_dict["video_url"] = video_url
  496. video_url_dict["audio_url"] = audio_url
  497. video_url_dict["video_width"] = video_width
  498. video_url_dict["video_height"] = video_height
  499. else:
  500. video_url_dict["video_url"] = ""
  501. video_url_dict["audio_url"] = ""
  502. video_url_dict["video_width"] = 0
  503. video_url_dict["video_height"] = 0
  504. elif "normal" in video_info["videoResource"]:
  505. if (
  506. "video_list" in video_info["videoResource"]["normal"]
  507. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  508. ):
  509. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  510. "backup_url_1"
  511. ]
  512. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  513. "backup_url_1"
  514. ]
  515. if len(video_url) % 3 == 1:
  516. video_url += "=="
  517. elif len(video_url) % 3 == 2:
  518. video_url += "="
  519. elif len(audio_url) % 3 == 1:
  520. audio_url += "=="
  521. elif len(audio_url) % 3 == 2:
  522. audio_url += "="
  523. video_url = base64.b64decode(video_url).decode("utf8")
  524. audio_url = base64.b64decode(audio_url).decode("utf8")
  525. video_width = video_info["videoResource"]["normal"]["video_list"][
  526. "video_4"
  527. ]["vwidth"]
  528. video_height = video_info["videoResource"]["normal"]["video_list"][
  529. "video_4"
  530. ]["vheight"]
  531. video_url_dict["video_url"] = video_url
  532. video_url_dict["audio_url"] = audio_url
  533. video_url_dict["video_width"] = video_width
  534. video_url_dict["video_height"] = video_height
  535. elif (
  536. "video_list" in video_info["videoResource"]["normal"]
  537. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  538. ):
  539. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  540. "backup_url_1"
  541. ]
  542. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  543. "backup_url_1"
  544. ]
  545. if len(video_url) % 3 == 1:
  546. video_url += "=="
  547. elif len(video_url) % 3 == 2:
  548. video_url += "="
  549. elif len(audio_url) % 3 == 1:
  550. audio_url += "=="
  551. elif len(audio_url) % 3 == 2:
  552. audio_url += "="
  553. video_url = base64.b64decode(video_url).decode("utf8")
  554. audio_url = base64.b64decode(audio_url).decode("utf8")
  555. video_width = video_info["videoResource"]["normal"]["video_list"][
  556. "video_3"
  557. ]["vwidth"]
  558. video_height = video_info["videoResource"]["normal"]["video_list"][
  559. "video_3"
  560. ]["vheight"]
  561. video_url_dict["video_url"] = video_url
  562. video_url_dict["audio_url"] = audio_url
  563. video_url_dict["video_width"] = video_width
  564. video_url_dict["video_height"] = video_height
  565. elif (
  566. "video_list" in video_info["videoResource"]["normal"]
  567. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  568. ):
  569. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  570. "backup_url_1"
  571. ]
  572. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  573. "backup_url_1"
  574. ]
  575. if len(video_url) % 3 == 1:
  576. video_url += "=="
  577. elif len(video_url) % 3 == 2:
  578. video_url += "="
  579. elif len(audio_url) % 3 == 1:
  580. audio_url += "=="
  581. elif len(audio_url) % 3 == 2:
  582. audio_url += "="
  583. video_url = base64.b64decode(video_url).decode("utf8")
  584. audio_url = base64.b64decode(audio_url).decode("utf8")
  585. video_width = video_info["videoResource"]["normal"]["video_list"][
  586. "video_2"
  587. ]["vwidth"]
  588. video_height = video_info["videoResource"]["normal"]["video_list"][
  589. "video_2"
  590. ]["vheight"]
  591. video_url_dict["video_url"] = video_url
  592. video_url_dict["audio_url"] = audio_url
  593. video_url_dict["video_width"] = video_width
  594. video_url_dict["video_height"] = video_height
  595. elif (
  596. "video_list" in video_info["videoResource"]["normal"]
  597. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  598. ):
  599. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  600. "backup_url_1"
  601. ]
  602. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  603. "backup_url_1"
  604. ]
  605. if len(video_url) % 3 == 1:
  606. video_url += "=="
  607. elif len(video_url) % 3 == 2:
  608. video_url += "="
  609. elif len(audio_url) % 3 == 1:
  610. audio_url += "=="
  611. elif len(audio_url) % 3 == 2:
  612. audio_url += "="
  613. video_url = base64.b64decode(video_url).decode("utf8")
  614. audio_url = base64.b64decode(audio_url).decode("utf8")
  615. video_width = video_info["videoResource"]["normal"]["video_list"][
  616. "video_1"
  617. ]["vwidth"]
  618. video_height = video_info["videoResource"]["normal"]["video_list"][
  619. "video_1"
  620. ]["vheight"]
  621. video_url_dict["video_url"] = video_url
  622. video_url_dict["audio_url"] = audio_url
  623. video_url_dict["video_width"] = video_width
  624. video_url_dict["video_height"] = video_height
  625. elif (
  626. "dynamic_video" in video_info["videoResource"]["normal"]
  627. and "dynamic_video_list"
  628. in video_info["videoResource"]["normal"]["dynamic_video"]
  629. and "dynamic_audio_list"
  630. in video_info["videoResource"]["normal"]["dynamic_video"]
  631. and len(
  632. video_info["videoResource"]["normal"]["dynamic_video"][
  633. "dynamic_video_list"
  634. ]
  635. )
  636. != 0
  637. and len(
  638. video_info["videoResource"]["normal"]["dynamic_video"][
  639. "dynamic_audio_list"
  640. ]
  641. )
  642. != 0
  643. ):
  644. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  645. "dynamic_video_list"
  646. ][-1]["backup_url_1"]
  647. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  648. "dynamic_audio_list"
  649. ][-1]["backup_url_1"]
  650. if len(video_url) % 3 == 1:
  651. video_url += "=="
  652. elif len(video_url) % 3 == 2:
  653. video_url += "="
  654. elif len(audio_url) % 3 == 1:
  655. audio_url += "=="
  656. elif len(audio_url) % 3 == 2:
  657. audio_url += "="
  658. video_url = base64.b64decode(video_url).decode("utf8")
  659. audio_url = base64.b64decode(audio_url).decode("utf8")
  660. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  661. "dynamic_video_list"
  662. ][-1]["vwidth"]
  663. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  664. "dynamic_video_list"
  665. ][-1]["vheight"]
  666. video_url_dict["video_url"] = video_url
  667. video_url_dict["audio_url"] = audio_url
  668. video_url_dict["video_width"] = video_width
  669. video_url_dict["video_height"] = video_height
  670. else:
  671. video_url_dict["video_url"] = ""
  672. video_url_dict["audio_url"] = ""
  673. video_url_dict["video_width"] = 0
  674. video_url_dict["video_height"] = 0
  675. else:
  676. video_url_dict["video_url"] = ""
  677. video_url_dict["audio_url"] = ""
  678. video_url_dict["video_width"] = 0
  679. video_url_dict["video_height"] = 0
  680. return video_url_dict
  681. def get_comment_cnt(item_id):
  682. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  683. params = {
  684. "tab_index": "0",
  685. "count": "10",
  686. "offset": "10",
  687. "group_id": str(item_id),
  688. "item_id": str(item_id),
  689. "aid": "1768",
  690. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  691. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  692. "_signature": FakeUserAgent().random,
  693. }
  694. headers = {
  695. "authority": "www.ixigua.com",
  696. "accept": "application/json, text/plain, */*",
  697. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  698. "cache-control": "no-cache",
  699. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  700. "pragma": "no-cache",
  701. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  702. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  703. "sec-ch-ua-mobile": "?0",
  704. "sec-ch-ua-platform": '"macOS"',
  705. "sec-fetch-dest": "empty",
  706. "sec-fetch-mode": "cors",
  707. "sec-fetch-site": "same-origin",
  708. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  709. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  710. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  711. }
  712. response = requests.get(
  713. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  714. )
  715. response.close()
  716. if (
  717. response.status_code != 200
  718. or "total_number" not in response.json()
  719. or response.json() == {}
  720. ):
  721. return 0
  722. return response.json().get("total_number", 0)
  723. def get_video_info(item_id, trace_id):
  724. url = "https://www.ixigua.com/api/mixVideo/information?"
  725. headers = {
  726. "accept-encoding": "gzip, deflate",
  727. "accept-language": "zh-CN,zh-Hans;q=0.9",
  728. "user-agent": FakeUserAgent().random,
  729. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  730. }
  731. params = {
  732. "mixId": str(item_id),
  733. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
  734. "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  735. "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
  736. "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
  737. "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
  738. }
  739. cookies = {
  740. "ixigua-a-s": "1",
  741. "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
  742. "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
  743. "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
  744. "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
  745. "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
  746. "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
  747. "__ac_nonce": "06304878000964fdad287",
  748. "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
  749. "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
  750. "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
  751. "_tea_utm_cache_1300": "undefined",
  752. "support_avif": "false",
  753. "support_webp": "false",
  754. "xiguavideopcwebid": "7134967546256016900",
  755. "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
  756. }
  757. response = requests.get(
  758. url=url,
  759. headers=headers,
  760. params=params,
  761. cookies=cookies,
  762. proxies=tunnel_proxies(),
  763. timeout=5,
  764. )
  765. if (
  766. response.status_code != 200
  767. or "data" not in response.json()
  768. or response.json()["data"] == {}
  769. ):
  770. print("获取视频信息失败")
  771. return None
  772. else:
  773. video_info = (
  774. response.json()["data"]
  775. .get("gidInformation", {})
  776. .get("packerData", {})
  777. .get("video", {})
  778. )
  779. if video_info == {}:
  780. return None
  781. video_detail = get_video_url(video_info)
  782. video_dict = {
  783. "video_title": video_info.get("title", ""),
  784. "video_id": video_info.get("videoResource", {}).get("vid", ""),
  785. "gid": str(item_id),
  786. "play_cnt": int(video_info.get("video_watch_count", 0)),
  787. "like_cnt": int(video_info.get("video_like_count", 0)),
  788. "comment_cnt": int(get_comment_cnt(item_id)),
  789. "share_cnt": 0,
  790. "favorite_cnt": 0,
  791. "duration": int(video_info.get("video_duration", 0)),
  792. "video_width": int(video_detail["video_width"]),
  793. "video_height": int(video_detail["video_height"]),
  794. "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
  795. "publish_time_str": time.strftime(
  796. "%Y-%m-%d %H:%M:%S",
  797. time.localtime(int(video_info.get("video_publish_time", 0))),
  798. ),
  799. "user_name": video_info.get("user_info", {}).get("name", ""),
  800. "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
  801. "avatar_url": str(
  802. video_info.get("user_info", {}).get("avatar_url", "")
  803. ),
  804. "cover_url": video_info.get("poster_url", ""),
  805. "audio_url": video_detail["audio_url"],
  806. "video_url": video_detail["video_url"],
  807. "session": f"xigua-search-{int(time.time())}",
  808. }
  809. return video_dict
  810. class XiGuaAuthor:
  811. def __init__(self, platform, mode, rule_dict, env, user_list):
  812. self.platform = platform
  813. self.mode = mode
  814. self.rule_dict = rule_dict
  815. self.env = env
  816. self.user_list = user_list
  817. # self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  818. self.download_count = 0
  819. def get_author_list(self):
  820. # 每轮只抓取定量的数据,到达数量后自己退出
  821. max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  822. for user_dict in self.user_list:
  823. self.get_video_list(user_dict)
  824. if self.download_count <= max_count:
  825. self.get_video_list(user_dict)
  826. time.sleep(random.randint(1, 15))
  827. else:
  828. print("本轮已经抓取足够数量的视频,已经自动退出")
  829. return
  830. def get_video_list(self, user_dict):
  831. offset = 0
  832. signature = random_signature()
  833. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  834. while True:
  835. params = {
  836. "to_user_id": str(
  837. user_dict["link"].replace("https://www.ixigua.com/home/", "")
  838. ),
  839. "offset": str(offset),
  840. "limit": "30",
  841. "maxBehotTime": "0",
  842. "order": "new",
  843. "isHome": "0",
  844. # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
  845. # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
  846. "_signature": signature,
  847. }
  848. headers = {
  849. "referer": f'https://www.ixigua.com/home/{user_dict["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  850. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  851. }
  852. response = requests.get(
  853. url=url,
  854. headers=headers,
  855. params=params,
  856. proxies=tunnel_proxies(),
  857. timeout=5,
  858. )
  859. offset += 30
  860. if "data" not in response.text or response.status_code != 200:
  861. print(f"get_videoList:{response.text}\n")
  862. return
  863. elif not response.json()["data"]["videoList"]:
  864. print(f"没有更多数据啦~\n")
  865. return
  866. else:
  867. feeds = response.json()["data"]["videoList"]
  868. for video_obj in feeds:
  869. print(video_obj['is_top'])
  870. # print(json.dumps(video_obj, ensure_ascii=False, indent=4))
  871. # return
  872. self.process_video_obj(video_obj, user_dict)
  873. # try:
  874. # print("扫描到一条视频")
  875. # self.process_video_obj(video_obj, user_dict)
  876. # except Exception as e:
  877. # print("抓取单条视频异常, 报错原因是: {}".format(e))
  878. def process_video_obj(self, video_obj, user_dict):
  879. trace_id = self.platform + str(uuid.uuid1())
  880. item_id = video_obj.get("item_id", "")
  881. if not item_id:
  882. print("无效视频")
  883. return
  884. # 获取视频信息
  885. video_dict = get_video_info(item_id=item_id, trace_id=trace_id)
  886. video_dict["out_user_id"] = video_dict["user_id"]
  887. video_dict["platform"] = self.platform
  888. video_dict["strategy"] = self.mode
  889. video_dict["out_video_id"] = video_dict["video_id"]
  890. video_dict["width"] = video_dict["video_width"]
  891. video_dict["height"] = video_dict["video_height"]
  892. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  893. video_dict["user_id"] = user_dict["uid"]
  894. video_dict["publish_time"] = video_dict["publish_time_str"]
  895. video_dict["strategy_type"] = self.mode
  896. video_dict["update_time_stamp"] = int(time.time())
  897. pipeline = PiaoQuanPipelineTest(
  898. platform=self.platform,
  899. mode=self.mode,
  900. rule_dict=self.rule_dict,
  901. env=self.env,
  902. item=video_dict,
  903. trace_id=trace_id,
  904. )
  905. flag = pipeline.process_item()
  906. if flag:
  907. print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  908. # self.mq.send_msg(video_dict)
  909. self.download_count += 1
  910. print("成功发送 MQ 至 ETL")
  911. if __name__ == "__main__":
  912. user_list = [
  913. {
  914. "uid": 6267140,
  915. "source": "xigua",
  916. "link": "https://www.ixigua.com/home/113976532286319/?list_entrance=anyVideo",
  917. "nick_name": "云姐犹记",
  918. "avatar_url": "",
  919. "mode": "author",
  920. }
  921. ]
  922. # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100, 'max': 0}}
  923. XGA = XiGuaAuthor(
  924. platform="xigua",
  925. mode="author",
  926. rule_dict={},
  927. env="prod",
  928. user_list=user_list
  929. )
  930. XGA.get_author_list()
  931. # item_id = "v0201ag10000cl4d7djc77u73eftvrcg"
  932. # get_video_info(item_id=item_id, trace_id="ljh")