xigua_author.py 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151
  1. import json
  2. import os
  3. import re
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from fake_useragent import FakeUserAgent
  12. from common.mq import MQ
  13. sys.path.append(os.getcwd())
  14. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  15. from common.limit import AuthorLimit
  16. def extract_info_by_re(text):
  17. """
  18. 通过正则表达式获取文本中的信息
  19. :param text:
  20. :return:
  21. """
  22. # 标题
  23. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  24. if title_match:
  25. title_content = title_match.group(1)
  26. title_content = title_content.split(" - ")[0]
  27. title_content = bytes(title_content, "latin1").decode()
  28. else:
  29. title_content = ""
  30. # video_url
  31. main_url = re.search(r'("main_url":")(.*?)"', text)[0]
  32. main_url = main_url.split(":")[1]
  33. decoded_data = base64.b64decode(main_url)
  34. try:
  35. # 尝试使用utf-8解码
  36. video_url = decoded_data.decode()
  37. except UnicodeDecodeError:
  38. # 如果utf-8解码失败,尝试使用其他编码方式
  39. video_url = decoded_data.decode('latin-1')
  40. # video_id
  41. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  42. # like_count
  43. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  44. # cover_url
  45. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  46. # video_play
  47. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  48. # "video_publish_time"
  49. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  50. # video_duration
  51. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  52. return {
  53. "title": title_content,
  54. "url": video_url,
  55. "video_id": video_id,
  56. "like_count": like_count,
  57. "cover_url": cover_url,
  58. "play_count": video_watch_count,
  59. "publish_time": publish_time,
  60. "duration": duration
  61. }
  62. def random_signature():
  63. """
  64. 随机生成签名
  65. """
  66. src_digits = string.digits # string_数字
  67. src_uppercase = string.ascii_uppercase # string_大写字母
  68. src_lowercase = string.ascii_lowercase # string_小写字母
  69. digits_num = random.randint(1, 6)
  70. uppercase_num = random.randint(1, 26 - digits_num - 1)
  71. lowercase_num = 26 - (digits_num + uppercase_num)
  72. password = (
  73. random.sample(src_digits, digits_num)
  74. + random.sample(src_uppercase, uppercase_num)
  75. + random.sample(src_lowercase, lowercase_num)
  76. )
  77. random.shuffle(password)
  78. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  79. new_password_start = new_password[0:18]
  80. new_password_end = new_password[-7:]
  81. if new_password[18] == "8":
  82. new_password = new_password_start + "w" + new_password_end
  83. elif new_password[18] == "9":
  84. new_password = new_password_start + "x" + new_password_end
  85. elif new_password[18] == "-":
  86. new_password = new_password_start + "y" + new_password_end
  87. elif new_password[18] == ".":
  88. new_password = new_password_start + "z" + new_password_end
  89. else:
  90. new_password = new_password_start + "y" + new_password_end
  91. return new_password
  92. def byte_dance_cookie(item_id):
  93. """
  94. 获取西瓜视频的 cookie
  95. :param item_id:
  96. """
  97. sess = requests.Session()
  98. sess.headers.update({
  99. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  100. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  101. })
  102. # 获取 cookies
  103. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  104. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  105. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  106. # print(r.text)
  107. return r.cookies.values()[0]
  108. def get_video_url(video_info):
  109. """
  110. 获取视频的链接
  111. """
  112. video_url_dict = {}
  113. # video_url
  114. if "videoResource" not in video_info:
  115. video_url_dict["video_url"] = ""
  116. video_url_dict["audio_url"] = ""
  117. video_url_dict["video_width"] = 0
  118. video_url_dict["video_height"] = 0
  119. elif "dash_120fps" in video_info["videoResource"]:
  120. if (
  121. "video_list" in video_info["videoResource"]["dash_120fps"]
  122. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  123. ):
  124. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  125. "video_4"
  126. ]["backup_url_1"]
  127. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  128. "video_4"
  129. ]["backup_url_1"]
  130. if len(video_url) % 3 == 1:
  131. video_url += "=="
  132. elif len(video_url) % 3 == 2:
  133. video_url += "="
  134. elif len(audio_url) % 3 == 1:
  135. audio_url += "=="
  136. elif len(audio_url) % 3 == 2:
  137. audio_url += "="
  138. video_url = base64.b64decode(video_url).decode("utf8")
  139. audio_url = base64.b64decode(audio_url).decode("utf8")
  140. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  141. "video_4"
  142. ]["vwidth"]
  143. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  144. "video_4"
  145. ]["vheight"]
  146. video_url_dict["video_url"] = video_url
  147. video_url_dict["audio_url"] = audio_url
  148. video_url_dict["video_width"] = video_width
  149. video_url_dict["video_height"] = video_height
  150. elif (
  151. "video_list" in video_info["videoResource"]["dash_120fps"]
  152. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  153. ):
  154. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  155. "video_3"
  156. ]["backup_url_1"]
  157. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  158. "video_3"
  159. ]["backup_url_1"]
  160. if len(video_url) % 3 == 1:
  161. video_url += "=="
  162. elif len(video_url) % 3 == 2:
  163. video_url += "="
  164. elif len(audio_url) % 3 == 1:
  165. audio_url += "=="
  166. elif len(audio_url) % 3 == 2:
  167. audio_url += "="
  168. video_url = base64.b64decode(video_url).decode("utf8")
  169. audio_url = base64.b64decode(audio_url).decode("utf8")
  170. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  171. "video_3"
  172. ]["vwidth"]
  173. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  174. "video_3"
  175. ]["vheight"]
  176. video_url_dict["video_url"] = video_url
  177. video_url_dict["audio_url"] = audio_url
  178. video_url_dict["video_width"] = video_width
  179. video_url_dict["video_height"] = video_height
  180. elif (
  181. "video_list" in video_info["videoResource"]["dash_120fps"]
  182. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  183. ):
  184. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  185. "video_2"
  186. ]["backup_url_1"]
  187. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  188. "video_2"
  189. ]["backup_url_1"]
  190. if len(video_url) % 3 == 1:
  191. video_url += "=="
  192. elif len(video_url) % 3 == 2:
  193. video_url += "="
  194. elif len(audio_url) % 3 == 1:
  195. audio_url += "=="
  196. elif len(audio_url) % 3 == 2:
  197. audio_url += "="
  198. video_url = base64.b64decode(video_url).decode("utf8")
  199. audio_url = base64.b64decode(audio_url).decode("utf8")
  200. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  201. "video_2"
  202. ]["vwidth"]
  203. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  204. "video_2"
  205. ]["vheight"]
  206. video_url_dict["video_url"] = video_url
  207. video_url_dict["audio_url"] = audio_url
  208. video_url_dict["video_width"] = video_width
  209. video_url_dict["video_height"] = video_height
  210. elif (
  211. "video_list" in video_info["videoResource"]["dash_120fps"]
  212. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  213. ):
  214. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  215. "video_1"
  216. ]["backup_url_1"]
  217. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  218. "video_1"
  219. ]["backup_url_1"]
  220. if len(video_url) % 3 == 1:
  221. video_url += "=="
  222. elif len(video_url) % 3 == 2:
  223. video_url += "="
  224. elif len(audio_url) % 3 == 1:
  225. audio_url += "=="
  226. elif len(audio_url) % 3 == 2:
  227. audio_url += "="
  228. video_url = base64.b64decode(video_url).decode("utf8")
  229. audio_url = base64.b64decode(audio_url).decode("utf8")
  230. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  231. "video_1"
  232. ]["vwidth"]
  233. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  234. "video_1"
  235. ]["vheight"]
  236. video_url_dict["video_url"] = video_url
  237. video_url_dict["audio_url"] = audio_url
  238. video_url_dict["video_width"] = video_width
  239. video_url_dict["video_height"] = video_height
  240. elif (
  241. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  242. and "dynamic_video_list"
  243. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  244. and "dynamic_audio_list"
  245. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  246. and len(
  247. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  248. "dynamic_video_list"
  249. ]
  250. )
  251. != 0
  252. and len(
  253. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  254. "dynamic_audio_list"
  255. ]
  256. )
  257. != 0
  258. ):
  259. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  260. "dynamic_video_list"
  261. ][-1]["backup_url_1"]
  262. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  263. "dynamic_audio_list"
  264. ][-1]["backup_url_1"]
  265. if len(video_url) % 3 == 1:
  266. video_url += "=="
  267. elif len(video_url) % 3 == 2:
  268. video_url += "="
  269. elif len(audio_url) % 3 == 1:
  270. audio_url += "=="
  271. elif len(audio_url) % 3 == 2:
  272. audio_url += "="
  273. video_url = base64.b64decode(video_url).decode("utf8")
  274. audio_url = base64.b64decode(audio_url).decode("utf8")
  275. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  276. "dynamic_video_list"
  277. ][-1]["vwidth"]
  278. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  279. "dynamic_video_list"
  280. ][-1]["vheight"]
  281. video_url_dict["video_url"] = video_url
  282. video_url_dict["audio_url"] = audio_url
  283. video_url_dict["video_width"] = video_width
  284. video_url_dict["video_height"] = video_height
  285. else:
  286. video_url_dict["video_url"] = ""
  287. video_url_dict["audio_url"] = ""
  288. video_url_dict["video_width"] = 0
  289. video_url_dict["video_height"] = 0
  290. elif "dash" in video_info["videoResource"]:
  291. if (
  292. "video_list" in video_info["videoResource"]["dash"]
  293. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  294. ):
  295. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  296. "backup_url_1"
  297. ]
  298. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  299. "backup_url_1"
  300. ]
  301. if len(video_url) % 3 == 1:
  302. video_url += "=="
  303. elif len(video_url) % 3 == 2:
  304. video_url += "="
  305. elif len(audio_url) % 3 == 1:
  306. audio_url += "=="
  307. elif len(audio_url) % 3 == 2:
  308. audio_url += "="
  309. video_url = base64.b64decode(video_url).decode("utf8")
  310. audio_url = base64.b64decode(audio_url).decode("utf8")
  311. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  312. "vwidth"
  313. ]
  314. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  315. "vheight"
  316. ]
  317. video_url_dict["video_url"] = video_url
  318. video_url_dict["audio_url"] = audio_url
  319. video_url_dict["video_width"] = video_width
  320. video_url_dict["video_height"] = video_height
  321. elif (
  322. "video_list" in video_info["videoResource"]["dash"]
  323. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  324. ):
  325. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  326. "backup_url_1"
  327. ]
  328. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  329. "backup_url_1"
  330. ]
  331. if len(video_url) % 3 == 1:
  332. video_url += "=="
  333. elif len(video_url) % 3 == 2:
  334. video_url += "="
  335. elif len(audio_url) % 3 == 1:
  336. audio_url += "=="
  337. elif len(audio_url) % 3 == 2:
  338. audio_url += "="
  339. video_url = base64.b64decode(video_url).decode("utf8")
  340. audio_url = base64.b64decode(audio_url).decode("utf8")
  341. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  342. "vwidth"
  343. ]
  344. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  345. "vheight"
  346. ]
  347. video_url_dict["video_url"] = video_url
  348. video_url_dict["audio_url"] = audio_url
  349. video_url_dict["video_width"] = video_width
  350. video_url_dict["video_height"] = video_height
  351. elif (
  352. "video_list" in video_info["videoResource"]["dash"]
  353. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  354. ):
  355. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  356. "backup_url_1"
  357. ]
  358. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  359. "backup_url_1"
  360. ]
  361. if len(video_url) % 3 == 1:
  362. video_url += "=="
  363. elif len(video_url) % 3 == 2:
  364. video_url += "="
  365. elif len(audio_url) % 3 == 1:
  366. audio_url += "=="
  367. elif len(audio_url) % 3 == 2:
  368. audio_url += "="
  369. video_url = base64.b64decode(video_url).decode("utf8")
  370. audio_url = base64.b64decode(audio_url).decode("utf8")
  371. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  372. "vwidth"
  373. ]
  374. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  375. "vheight"
  376. ]
  377. video_url_dict["video_url"] = video_url
  378. video_url_dict["audio_url"] = audio_url
  379. video_url_dict["video_width"] = video_width
  380. video_url_dict["video_height"] = video_height
  381. elif (
  382. "video_list" in video_info["videoResource"]["dash"]
  383. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  384. ):
  385. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  386. "backup_url_1"
  387. ]
  388. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  389. "backup_url_1"
  390. ]
  391. if len(video_url) % 3 == 1:
  392. video_url += "=="
  393. elif len(video_url) % 3 == 2:
  394. video_url += "="
  395. elif len(audio_url) % 3 == 1:
  396. audio_url += "=="
  397. elif len(audio_url) % 3 == 2:
  398. audio_url += "="
  399. video_url = base64.b64decode(video_url).decode("utf8")
  400. audio_url = base64.b64decode(audio_url).decode("utf8")
  401. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  402. "vwidth"
  403. ]
  404. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  405. "vheight"
  406. ]
  407. video_url_dict["video_url"] = video_url
  408. video_url_dict["audio_url"] = audio_url
  409. video_url_dict["video_width"] = video_width
  410. video_url_dict["video_height"] = video_height
  411. elif (
  412. "dynamic_video" in video_info["videoResource"]["dash"]
  413. and "dynamic_video_list"
  414. in video_info["videoResource"]["dash"]["dynamic_video"]
  415. and "dynamic_audio_list"
  416. in video_info["videoResource"]["dash"]["dynamic_video"]
  417. and len(
  418. video_info["videoResource"]["dash"]["dynamic_video"][
  419. "dynamic_video_list"
  420. ]
  421. )
  422. != 0
  423. and len(
  424. video_info["videoResource"]["dash"]["dynamic_video"][
  425. "dynamic_audio_list"
  426. ]
  427. )
  428. != 0
  429. ):
  430. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  431. "dynamic_video_list"
  432. ][-1]["backup_url_1"]
  433. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  434. "dynamic_audio_list"
  435. ][-1]["backup_url_1"]
  436. if len(video_url) % 3 == 1:
  437. video_url += "=="
  438. elif len(video_url) % 3 == 2:
  439. video_url += "="
  440. elif len(audio_url) % 3 == 1:
  441. audio_url += "=="
  442. elif len(audio_url) % 3 == 2:
  443. audio_url += "="
  444. video_url = base64.b64decode(video_url).decode("utf8")
  445. audio_url = base64.b64decode(audio_url).decode("utf8")
  446. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  447. "dynamic_video_list"
  448. ][-1]["vwidth"]
  449. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  450. "dynamic_video_list"
  451. ][-1]["vheight"]
  452. video_url_dict["video_url"] = video_url
  453. video_url_dict["audio_url"] = audio_url
  454. video_url_dict["video_width"] = video_width
  455. video_url_dict["video_height"] = video_height
  456. else:
  457. video_url_dict["video_url"] = ""
  458. video_url_dict["audio_url"] = ""
  459. video_url_dict["video_width"] = 0
  460. video_url_dict["video_height"] = 0
  461. elif "normal" in video_info["videoResource"]:
  462. if (
  463. "video_list" in video_info["videoResource"]["normal"]
  464. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  465. ):
  466. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  467. "backup_url_1"
  468. ]
  469. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  470. "backup_url_1"
  471. ]
  472. if len(video_url) % 3 == 1:
  473. video_url += "=="
  474. elif len(video_url) % 3 == 2:
  475. video_url += "="
  476. elif len(audio_url) % 3 == 1:
  477. audio_url += "=="
  478. elif len(audio_url) % 3 == 2:
  479. audio_url += "="
  480. video_url = base64.b64decode(video_url).decode("utf8")
  481. audio_url = base64.b64decode(audio_url).decode("utf8")
  482. video_width = video_info["videoResource"]["normal"]["video_list"][
  483. "video_4"
  484. ]["vwidth"]
  485. video_height = video_info["videoResource"]["normal"]["video_list"][
  486. "video_4"
  487. ]["vheight"]
  488. video_url_dict["video_url"] = video_url
  489. video_url_dict["audio_url"] = audio_url
  490. video_url_dict["video_width"] = video_width
  491. video_url_dict["video_height"] = video_height
  492. elif (
  493. "video_list" in video_info["videoResource"]["normal"]
  494. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  495. ):
  496. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  497. "backup_url_1"
  498. ]
  499. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  500. "backup_url_1"
  501. ]
  502. if len(video_url) % 3 == 1:
  503. video_url += "=="
  504. elif len(video_url) % 3 == 2:
  505. video_url += "="
  506. elif len(audio_url) % 3 == 1:
  507. audio_url += "=="
  508. elif len(audio_url) % 3 == 2:
  509. audio_url += "="
  510. video_url = base64.b64decode(video_url).decode("utf8")
  511. audio_url = base64.b64decode(audio_url).decode("utf8")
  512. video_width = video_info["videoResource"]["normal"]["video_list"][
  513. "video_3"
  514. ]["vwidth"]
  515. video_height = video_info["videoResource"]["normal"]["video_list"][
  516. "video_3"
  517. ]["vheight"]
  518. video_url_dict["video_url"] = video_url
  519. video_url_dict["audio_url"] = audio_url
  520. video_url_dict["video_width"] = video_width
  521. video_url_dict["video_height"] = video_height
  522. elif (
  523. "video_list" in video_info["videoResource"]["normal"]
  524. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  525. ):
  526. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  527. "backup_url_1"
  528. ]
  529. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  530. "backup_url_1"
  531. ]
  532. if len(video_url) % 3 == 1:
  533. video_url += "=="
  534. elif len(video_url) % 3 == 2:
  535. video_url += "="
  536. elif len(audio_url) % 3 == 1:
  537. audio_url += "=="
  538. elif len(audio_url) % 3 == 2:
  539. audio_url += "="
  540. video_url = base64.b64decode(video_url).decode("utf8")
  541. audio_url = base64.b64decode(audio_url).decode("utf8")
  542. video_width = video_info["videoResource"]["normal"]["video_list"][
  543. "video_2"
  544. ]["vwidth"]
  545. video_height = video_info["videoResource"]["normal"]["video_list"][
  546. "video_2"
  547. ]["vheight"]
  548. video_url_dict["video_url"] = video_url
  549. video_url_dict["audio_url"] = audio_url
  550. video_url_dict["video_width"] = video_width
  551. video_url_dict["video_height"] = video_height
  552. elif (
  553. "video_list" in video_info["videoResource"]["normal"]
  554. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  555. ):
  556. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  557. "backup_url_1"
  558. ]
  559. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  560. "backup_url_1"
  561. ]
  562. if len(video_url) % 3 == 1:
  563. video_url += "=="
  564. elif len(video_url) % 3 == 2:
  565. video_url += "="
  566. elif len(audio_url) % 3 == 1:
  567. audio_url += "=="
  568. elif len(audio_url) % 3 == 2:
  569. audio_url += "="
  570. video_url = base64.b64decode(video_url).decode("utf8")
  571. audio_url = base64.b64decode(audio_url).decode("utf8")
  572. video_width = video_info["videoResource"]["normal"]["video_list"][
  573. "video_1"
  574. ]["vwidth"]
  575. video_height = video_info["videoResource"]["normal"]["video_list"][
  576. "video_1"
  577. ]["vheight"]
  578. video_url_dict["video_url"] = video_url
  579. video_url_dict["audio_url"] = audio_url
  580. video_url_dict["video_width"] = video_width
  581. video_url_dict["video_height"] = video_height
  582. elif (
  583. "dynamic_video" in video_info["videoResource"]["normal"]
  584. and "dynamic_video_list"
  585. in video_info["videoResource"]["normal"]["dynamic_video"]
  586. and "dynamic_audio_list"
  587. in video_info["videoResource"]["normal"]["dynamic_video"]
  588. and len(
  589. video_info["videoResource"]["normal"]["dynamic_video"][
  590. "dynamic_video_list"
  591. ]
  592. )
  593. != 0
  594. and len(
  595. video_info["videoResource"]["normal"]["dynamic_video"][
  596. "dynamic_audio_list"
  597. ]
  598. )
  599. != 0
  600. ):
  601. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  602. "dynamic_video_list"
  603. ][-1]["backup_url_1"]
  604. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  605. "dynamic_audio_list"
  606. ][-1]["backup_url_1"]
  607. if len(video_url) % 3 == 1:
  608. video_url += "=="
  609. elif len(video_url) % 3 == 2:
  610. video_url += "="
  611. elif len(audio_url) % 3 == 1:
  612. audio_url += "=="
  613. elif len(audio_url) % 3 == 2:
  614. audio_url += "="
  615. video_url = base64.b64decode(video_url).decode("utf8")
  616. audio_url = base64.b64decode(audio_url).decode("utf8")
  617. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  618. "dynamic_video_list"
  619. ][-1]["vwidth"]
  620. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  621. "dynamic_video_list"
  622. ][-1]["vheight"]
  623. video_url_dict["video_url"] = video_url
  624. video_url_dict["audio_url"] = audio_url
  625. video_url_dict["video_width"] = video_width
  626. video_url_dict["video_height"] = video_height
  627. else:
  628. video_url_dict["video_url"] = ""
  629. video_url_dict["audio_url"] = ""
  630. video_url_dict["video_width"] = 0
  631. video_url_dict["video_height"] = 0
  632. else:
  633. video_url_dict["video_url"] = ""
  634. video_url_dict["audio_url"] = ""
  635. video_url_dict["video_width"] = 0
  636. video_url_dict["video_height"] = 0
  637. return video_url_dict
  638. def get_comment_cnt(item_id):
  639. """
  640. 获取视频的评论数量
  641. """
  642. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  643. params = {
  644. "tab_index": "0",
  645. "count": "10",
  646. "offset": "10",
  647. "group_id": str(item_id),
  648. "item_id": str(item_id),
  649. "aid": "1768",
  650. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  651. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  652. "_signature": random_signature(),
  653. }
  654. headers = {
  655. "authority": "www.ixigua.com",
  656. "accept": "application/json, text/plain, */*",
  657. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  658. "cache-control": "no-cache",
  659. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  660. "pragma": "no-cache",
  661. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  662. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  663. "sec-ch-ua-mobile": "?0",
  664. "sec-ch-ua-platform": '"macOS"',
  665. "sec-fetch-dest": "empty",
  666. "sec-fetch-mode": "cors",
  667. "sec-fetch-site": "same-origin",
  668. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  669. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  670. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  671. }
  672. response = requests.get(
  673. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  674. )
  675. response.close()
  676. if (
  677. response.status_code != 200
  678. or "total_number" not in response.json()
  679. or response.json() == {}
  680. ):
  681. return 0
  682. return response.json().get("total_number", 0)
  683. class XiGuaAuthor:
  684. """
  685. 西瓜账号爬虫
  686. """
  687. def __init__(self, platform, mode, rule_dict, env, user_list):
  688. self.platform = platform
  689. self.mode = mode
  690. self.rule_dict = rule_dict
  691. self.env = env
  692. self.user_list = user_list
  693. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  694. self.download_count = 0
  695. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  696. def rule_maker(self, account):
  697. """
  698. 通过不同的账号生成不同的规则
  699. :param account: 输入的账号信息
  700. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  701. """
  702. temp = account['link'].split("?")[0].split("_")
  703. if len(temp) == 1:
  704. return self.rule_dict
  705. else:
  706. flag = temp[-2]
  707. match flag:
  708. case "V1":
  709. rule_dict = {
  710. "play_cnt": {"min": 100000, "max": 0},
  711. 'period': {"min": 90, "max": 90},
  712. 'special': 0.02
  713. }
  714. return rule_dict
  715. case "V2":
  716. rule_dict = {
  717. "play_cnt": {"min": 10000, "max": 0},
  718. 'period': {"min": 90, "max": 90},
  719. 'special': 0.01
  720. }
  721. return rule_dict
  722. case "V3":
  723. rule_dict = {
  724. "play_cnt": {"min": 5000, "max": 0},
  725. 'period': {"min": 90, "max": 90},
  726. 'special': 0.01
  727. }
  728. return rule_dict
  729. def get_author_list(self):
  730. """
  731. 每轮只抓取定量的数据,到达数量后自己退出
  732. 获取账号列表以及账号信息
  733. """
  734. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  735. for user_dict in self.user_list:
  736. # if self.download_count <= max_count:
  737. try:
  738. flag = user_dict["link"][0]
  739. match flag:
  740. case "V":
  741. self.get_video_list(user_dict)
  742. case "X":
  743. self.get_tiny_video_list(user_dict)
  744. case "h":
  745. self.get_video_list(user_dict)
  746. case "D":
  747. self.get_video_list(user_dict)
  748. case "B":
  749. self.get_video_list(user_dict)
  750. self.get_tiny_video_list(user_dict)
  751. except Exception as e:
  752. AliyunLogger.logging(
  753. code="3001",
  754. account=user_dict["uid"],
  755. platform=self.platform,
  756. mode=self.mode,
  757. env=self.env,
  758. message="扫描账号时出现bug, 报错是 {}".format(e)
  759. )
  760. # time.sleep(random.randint(1, 15))
  761. # else:
  762. # AliyunLogger.logging(
  763. # code="2000",
  764. # platform=self.platform,
  765. # mode=self.mode,
  766. # env=self.env,
  767. # message="本轮已经抓取足够数量的视频,已经自动退出",
  768. # )
  769. # return
  770. def get_video_list(self, user_dict):
  771. """
  772. 获取某个账号的视频列表
  773. 账号分为 3 类
  774. """
  775. offset = 0
  776. signature = random_signature()
  777. link = user_dict['link'].split("?")[0].split("_")[-1]
  778. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  779. while True:
  780. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  781. params = {
  782. "to_user_id": to_user_id,
  783. "offset": str(offset),
  784. "limit": "30",
  785. "maxBehotTime": "0",
  786. "order": "new",
  787. "isHome": "0",
  788. "_signature": signature,
  789. }
  790. headers = {
  791. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  792. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  793. }
  794. response = requests.get(
  795. url=url,
  796. headers=headers,
  797. params=params,
  798. proxies=tunnel_proxies(),
  799. timeout=5,
  800. )
  801. offset += 30
  802. if "data" not in response.text or response.status_code != 200:
  803. AliyunLogger.logging(
  804. code="3000",
  805. platform=self.platform,
  806. mode=self.mode,
  807. env=self.env,
  808. message=f"get_videoList:{response.text}\n",
  809. )
  810. return
  811. elif not response.json()["data"]["videoList"]:
  812. AliyunLogger.logging(
  813. account=link,
  814. code="3000",
  815. platform=self.platform,
  816. mode=self.mode,
  817. env=self.env,
  818. data=response.json(),
  819. message=f"没有更多数据啦~\n",
  820. )
  821. return
  822. else:
  823. feeds = response.json()["data"]["videoList"]
  824. for video_obj in feeds:
  825. try:
  826. AliyunLogger.logging(
  827. code="1001",
  828. account=user_dict['uid'],
  829. platform=self.platform,
  830. mode=self.mode,
  831. env=self.env,
  832. data=video_obj,
  833. message="扫描到一条视频",
  834. )
  835. date_flag = self.process_video_obj(video_obj, user_dict, "l")
  836. if not date_flag:
  837. return
  838. except Exception as e:
  839. AliyunLogger.logging(
  840. code="3000",
  841. platform=self.platform,
  842. mode=self.mode,
  843. env=self.env,
  844. data=video_obj,
  845. message="抓取单条视频异常, 报错原因是: {}".format(e),
  846. )
  847. def get_tiny_video_list(self, user_dict):
  848. """
  849. 获取小视频
  850. """
  851. url = "https://www.ixigua.com/api/videov2/hotsoon/video"
  852. max_behot_time = "0"
  853. link = user_dict['link'].split("?")[0].split("_")[-1]
  854. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  855. while True:
  856. params = {
  857. "to_user_id": to_user_id,
  858. "max_behot_time": max_behot_time,
  859. "_signature": random_signature()
  860. }
  861. headers = {
  862. "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
  863. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  864. }
  865. response = requests.get(
  866. url=url,
  867. headers=headers,
  868. params=params,
  869. proxies=tunnel_proxies(),
  870. timeout=5,
  871. )
  872. if "data" not in response.text or response.status_code != 200:
  873. AliyunLogger.logging(
  874. code="2000",
  875. platform=self.platform,
  876. mode=self.mode,
  877. env=self.env,
  878. message=f"get_videoList:{response.text}\n",
  879. )
  880. return
  881. elif not response.json()["data"]["data"]:
  882. AliyunLogger.logging(
  883. account=link,
  884. code="2000",
  885. platform=self.platform,
  886. mode=self.mode,
  887. env=self.env,
  888. data=response.json(),
  889. message=f"没有更多数据啦~\n",
  890. )
  891. return
  892. else:
  893. video_list = response.json()['data']['data']
  894. max_behot_time = video_list[-1]["max_behot_time"]
  895. for video_obj in video_list:
  896. try:
  897. AliyunLogger.logging(
  898. code="1001",
  899. account=user_dict['uid'],
  900. platform=self.platform,
  901. mode=self.mode,
  902. env=self.env,
  903. data=video_obj,
  904. message="扫描到一条小视频",
  905. )
  906. date_flag = self.process_video_obj(video_obj, user_dict, "s")
  907. if not date_flag:
  908. return
  909. except Exception as e:
  910. AliyunLogger.logging(
  911. code="3000",
  912. platform=self.platform,
  913. mode=self.mode,
  914. env=self.env,
  915. data=video_obj,
  916. message="抓取单条视频异常, 报错原因是: {}".format(e),
  917. )
  918. def process_video_obj(self, video_obj, user_dict, f):
  919. """
  920. process video_obj and extract video_url
  921. """
  922. new_rule = self.rule_maker(user_dict)
  923. trace_id = self.platform + str(uuid.uuid1())
  924. if f == "s":
  925. item_id = video_obj.get("id_str", "")
  926. else:
  927. item_id = video_obj.get("item_id", "")
  928. if not item_id:
  929. AliyunLogger.logging(
  930. code="2005",
  931. account=user_dict['uid'],
  932. platform=self.platform,
  933. mode=self.mode,
  934. env=self.env,
  935. message="无效视频",
  936. data=video_obj,
  937. trace_id=trace_id,
  938. )
  939. return
  940. # 获取视频信息
  941. video_dict = self.get_video_info(item_id=item_id)
  942. video_dict["platform"] = self.platform
  943. video_dict["strategy"] = self.mode
  944. video_dict["out_video_id"] = video_dict["video_id"]
  945. video_dict["width"] = video_dict["video_width"]
  946. video_dict["height"] = video_dict["video_height"]
  947. video_dict["crawler_rule"] = json.dumps(new_rule)
  948. video_dict["user_id"] = user_dict["uid"]
  949. video_dict["publish_time"] = video_dict["publish_time_str"]
  950. video_dict["strategy_type"] = self.mode
  951. video_dict["update_time_stamp"] = int(time.time())
  952. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  953. new_rule.get("period", {}).get("max", 1000)):
  954. if not video_obj['is_top']:
  955. """
  956. 非置顶数据发布时间超过才退出
  957. """
  958. AliyunLogger.logging(
  959. code="2004",
  960. account=user_dict['uid'],
  961. platform=self.platform,
  962. mode=self.mode,
  963. env=self.env,
  964. data=video_dict,
  965. message="发布时间超过{}天".format(
  966. int(new_rule.get("period", {}).get("max", 1000))
  967. ),
  968. )
  969. return False
  970. pipeline = PiaoQuanPipeline(
  971. platform=self.platform,
  972. mode=self.mode,
  973. rule_dict=new_rule,
  974. env=self.env,
  975. item=video_dict,
  976. trace_id=trace_id,
  977. )
  978. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  979. if limit_flag:
  980. title_flag = pipeline.title_flag()
  981. repeat_flag = pipeline.repeat_video()
  982. if title_flag and repeat_flag:
  983. if new_rule.get("special"):
  984. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  985. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  986. self.mq.send_msg(video_dict)
  987. self.download_count += 1
  988. AliyunLogger.logging(
  989. code="1002",
  990. account=user_dict['uid'],
  991. platform=self.platform,
  992. mode=self.mode,
  993. env=self.env,
  994. data=video_dict,
  995. trace_id=trace_id,
  996. message="成功发送 MQ 至 ETL",
  997. )
  998. return True
  999. else:
  1000. AliyunLogger.logging(
  1001. code="2008",
  1002. account=user_dict['uid'],
  1003. platform=self.platform,
  1004. mode=self.mode,
  1005. env=self.env,
  1006. message="不满足特殊规则, 点赞量/播放量",
  1007. data=video_dict
  1008. )
  1009. else:
  1010. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1011. self.mq.send_msg(video_dict)
  1012. self.download_count += 1
  1013. AliyunLogger.logging(
  1014. code="1002",
  1015. account=user_dict['uid'],
  1016. platform=self.platform,
  1017. mode=self.mode,
  1018. env=self.env,
  1019. data=video_dict,
  1020. trace_id=trace_id,
  1021. message="成功发送 MQ 至 ETL",
  1022. )
  1023. return True
  1024. else:
  1025. AliyunLogger.logging(
  1026. code="2008",
  1027. account=user_dict['uid'],
  1028. platform=self.platform,
  1029. mode=self.mode,
  1030. env=self.env,
  1031. message="不满足特殊规则, 播放量",
  1032. data=video_dict
  1033. )
  1034. return True
  1035. def get_video_info(self, item_id):
  1036. """
  1037. 获取视频信息
  1038. """
  1039. url = "https://www.ixigua.com/{}".format(item_id)
  1040. headers = {
  1041. "accept-encoding": "gzip, deflate",
  1042. "accept-language": "zh-CN,zh-Hans;q=0.9",
  1043. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  1044. "user-agent": FakeUserAgent().random,
  1045. "referer": "https://www.ixigua.com/{}/".format(item_id),
  1046. }
  1047. response = requests.get(
  1048. url=url,
  1049. headers=headers,
  1050. proxies=tunnel_proxies(),
  1051. timeout=5,
  1052. )
  1053. video_info = extract_info_by_re(response.text)
  1054. video_dict = {
  1055. "video_title": video_info.get("title", ""),
  1056. "video_id": video_info.get("video_id"),
  1057. "gid": str(item_id),
  1058. "play_cnt": int(video_info.get("play_count", 0)),
  1059. "like_cnt": int(video_info.get("like_count", 0)),
  1060. "comment_cnt": 0,
  1061. "share_cnt": 0,
  1062. "favorite_cnt": 0,
  1063. "duration": int(video_info.get("duration", 0)),
  1064. "video_width": 0,
  1065. "video_height": 0,
  1066. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  1067. "publish_time_str": time.strftime(
  1068. "%Y-%m-%d %H:%M:%S",
  1069. time.localtime(int(video_info.get("publish_time", 0))),
  1070. ),
  1071. "avatar_url": str(
  1072. video_info.get("user_info", {}).get("avatar_url", "")
  1073. ),
  1074. "cover_url": video_info.get("cover_url", ""),
  1075. "video_url": video_info.get("url"),
  1076. "session": f"xigua-search-{int(time.time())}",
  1077. }
  1078. return video_dict
  1079. if __name__ == "__main__":
  1080. user_list = [
  1081. {
  1082. "uid": 6267140,
  1083. "source": "xigua",
  1084. "link": "https://www.ixigua.com/home/2779177225827568",
  1085. "nick_name": "秋晴爱音乐",
  1086. "avatar_url": "",
  1087. "mode": "author",
  1088. },
  1089. {
  1090. "uid": 6267140,
  1091. "source": "xigua",
  1092. "link": "https://www.ixigua.com/home/2885546124776780",
  1093. "nick_name": "朗诵放歌的老山羊",
  1094. "avatar_url": "",
  1095. "mode": "author",
  1096. },
  1097. {
  1098. "uid": 6267140,
  1099. "source": "xigua",
  1100. "link": "https://www.ixigua.com/home/5880938217",
  1101. "nick_name": "天原声疗",
  1102. "avatar_url": "",
  1103. "mode": "author",
  1104. },
  1105. ]
  1106. rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1107. XGA = XiGuaAuthor(
  1108. platform="xigua",
  1109. mode="author",
  1110. rule_dict=rule,
  1111. env="prod",
  1112. user_list=user_list
  1113. )
  1114. XGA.get_author_list()