xigua_author.py 49 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191
  1. import json
  2. import os
  3. import re
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from lxml import etree
  12. from Crypto.Cipher import AES
  13. from Crypto.Util.Padding import unpad
  14. from fake_useragent import FakeUserAgent
  15. from common.mq import MQ
  16. sys.path.append(os.getcwd())
  17. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  18. from common.limit import AuthorLimit
  19. def aes_decrypt(data: str, key: str) -> str:
  20. """
  21. XiGua AES decrypt
  22. :param data:
  23. :param key:
  24. :return:
  25. """
  26. password = key.encode()
  27. iv = password[:16]
  28. try:
  29. ct = base64.b64decode(data.encode())
  30. cipher = AES.new(password, AES.MODE_CBC, iv)
  31. pt = unpad(cipher.decrypt(ct), AES.block_size)
  32. return base64.b64decode(pt).decode()
  33. except Exception as e:
  34. print("Incorrect decryption {}".format(e))
  35. return None
  36. def extract_video_url(text):
  37. """
  38. 获取视频 video_url
  39. :param text:
  40. :return:
  41. """
  42. HTML = etree.HTML(text)
  43. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  44. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  45. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  46. # python中不规则的定义
  47. for I in Irregulars:
  48. if I in ['=false', '=true']:
  49. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  50. else:
  51. json_2 = json_2.replace(I, '12')
  52. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]["dash"]
  53. ptk = dict_2["ptk"]
  54. video_url = dict_2['dynamic_video']['main_url']
  55. real_video_url = aes_decrypt(data=video_url, key=ptk)
  56. return real_video_url
  57. def extract_info_by_re(text):
  58. """
  59. 通过正则表达式获取文本中的信息
  60. :param text:
  61. :return:
  62. """
  63. # 标题
  64. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  65. if title_match:
  66. title_content = title_match.group(1)
  67. title_content = title_content.split(" - ")[0]
  68. title_content = bytes(title_content, "latin1").decode()
  69. else:
  70. title_content = ""
  71. # video_id
  72. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  73. # like_count
  74. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  75. # cover_url
  76. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  77. # video_play
  78. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  79. # "video_publish_time"
  80. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  81. # video_duration
  82. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  83. return {
  84. "title": title_content,
  85. "url": extract_video_url(text),
  86. "video_id": video_id,
  87. "like_count": like_count,
  88. "cover_url": cover_url,
  89. "play_count": video_watch_count,
  90. "publish_time": publish_time,
  91. "duration": duration
  92. }
  93. def random_signature():
  94. """
  95. 随机生成签名
  96. """
  97. src_digits = string.digits # string_数字
  98. src_uppercase = string.ascii_uppercase # string_大写字母
  99. src_lowercase = string.ascii_lowercase # string_小写字母
  100. digits_num = random.randint(1, 6)
  101. uppercase_num = random.randint(1, 26 - digits_num - 1)
  102. lowercase_num = 26 - (digits_num + uppercase_num)
  103. password = (
  104. random.sample(src_digits, digits_num)
  105. + random.sample(src_uppercase, uppercase_num)
  106. + random.sample(src_lowercase, lowercase_num)
  107. )
  108. random.shuffle(password)
  109. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  110. new_password_start = new_password[0:18]
  111. new_password_end = new_password[-7:]
  112. if new_password[18] == "8":
  113. new_password = new_password_start + "w" + new_password_end
  114. elif new_password[18] == "9":
  115. new_password = new_password_start + "x" + new_password_end
  116. elif new_password[18] == "-":
  117. new_password = new_password_start + "y" + new_password_end
  118. elif new_password[18] == ".":
  119. new_password = new_password_start + "z" + new_password_end
  120. else:
  121. new_password = new_password_start + "y" + new_password_end
  122. return new_password
  123. def byte_dance_cookie(item_id):
  124. """
  125. 获取西瓜视频的 cookie
  126. :param item_id:
  127. """
  128. sess = requests.Session()
  129. sess.headers.update({
  130. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  131. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  132. })
  133. # 获取 cookies
  134. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  135. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  136. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  137. # print(r.text)
  138. return r.cookies.values()[0]
  139. def get_video_url(video_info):
  140. """
  141. 获取视频的链接
  142. """
  143. video_url_dict = {}
  144. # video_url
  145. if "videoResource" not in video_info:
  146. video_url_dict["video_url"] = ""
  147. video_url_dict["audio_url"] = ""
  148. video_url_dict["video_width"] = 0
  149. video_url_dict["video_height"] = 0
  150. elif "dash_120fps" in video_info["videoResource"]:
  151. if (
  152. "video_list" in video_info["videoResource"]["dash_120fps"]
  153. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  154. ):
  155. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  156. "video_4"
  157. ]["backup_url_1"]
  158. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  159. "video_4"
  160. ]["backup_url_1"]
  161. if len(video_url) % 3 == 1:
  162. video_url += "=="
  163. elif len(video_url) % 3 == 2:
  164. video_url += "="
  165. elif len(audio_url) % 3 == 1:
  166. audio_url += "=="
  167. elif len(audio_url) % 3 == 2:
  168. audio_url += "="
  169. video_url = base64.b64decode(video_url).decode("utf8")
  170. audio_url = base64.b64decode(audio_url).decode("utf8")
  171. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  172. "video_4"
  173. ]["vwidth"]
  174. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  175. "video_4"
  176. ]["vheight"]
  177. video_url_dict["video_url"] = video_url
  178. video_url_dict["audio_url"] = audio_url
  179. video_url_dict["video_width"] = video_width
  180. video_url_dict["video_height"] = video_height
  181. elif (
  182. "video_list" in video_info["videoResource"]["dash_120fps"]
  183. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  184. ):
  185. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  186. "video_3"
  187. ]["backup_url_1"]
  188. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  189. "video_3"
  190. ]["backup_url_1"]
  191. if len(video_url) % 3 == 1:
  192. video_url += "=="
  193. elif len(video_url) % 3 == 2:
  194. video_url += "="
  195. elif len(audio_url) % 3 == 1:
  196. audio_url += "=="
  197. elif len(audio_url) % 3 == 2:
  198. audio_url += "="
  199. video_url = base64.b64decode(video_url).decode("utf8")
  200. audio_url = base64.b64decode(audio_url).decode("utf8")
  201. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  202. "video_3"
  203. ]["vwidth"]
  204. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  205. "video_3"
  206. ]["vheight"]
  207. video_url_dict["video_url"] = video_url
  208. video_url_dict["audio_url"] = audio_url
  209. video_url_dict["video_width"] = video_width
  210. video_url_dict["video_height"] = video_height
  211. elif (
  212. "video_list" in video_info["videoResource"]["dash_120fps"]
  213. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  214. ):
  215. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  216. "video_2"
  217. ]["backup_url_1"]
  218. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  219. "video_2"
  220. ]["backup_url_1"]
  221. if len(video_url) % 3 == 1:
  222. video_url += "=="
  223. elif len(video_url) % 3 == 2:
  224. video_url += "="
  225. elif len(audio_url) % 3 == 1:
  226. audio_url += "=="
  227. elif len(audio_url) % 3 == 2:
  228. audio_url += "="
  229. video_url = base64.b64decode(video_url).decode("utf8")
  230. audio_url = base64.b64decode(audio_url).decode("utf8")
  231. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  232. "video_2"
  233. ]["vwidth"]
  234. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  235. "video_2"
  236. ]["vheight"]
  237. video_url_dict["video_url"] = video_url
  238. video_url_dict["audio_url"] = audio_url
  239. video_url_dict["video_width"] = video_width
  240. video_url_dict["video_height"] = video_height
  241. elif (
  242. "video_list" in video_info["videoResource"]["dash_120fps"]
  243. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  244. ):
  245. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  246. "video_1"
  247. ]["backup_url_1"]
  248. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  249. "video_1"
  250. ]["backup_url_1"]
  251. if len(video_url) % 3 == 1:
  252. video_url += "=="
  253. elif len(video_url) % 3 == 2:
  254. video_url += "="
  255. elif len(audio_url) % 3 == 1:
  256. audio_url += "=="
  257. elif len(audio_url) % 3 == 2:
  258. audio_url += "="
  259. video_url = base64.b64decode(video_url).decode("utf8")
  260. audio_url = base64.b64decode(audio_url).decode("utf8")
  261. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  262. "video_1"
  263. ]["vwidth"]
  264. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  265. "video_1"
  266. ]["vheight"]
  267. video_url_dict["video_url"] = video_url
  268. video_url_dict["audio_url"] = audio_url
  269. video_url_dict["video_width"] = video_width
  270. video_url_dict["video_height"] = video_height
  271. elif (
  272. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  273. and "dynamic_video_list"
  274. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  275. and "dynamic_audio_list"
  276. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  277. and len(
  278. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  279. "dynamic_video_list"
  280. ]
  281. )
  282. != 0
  283. and len(
  284. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  285. "dynamic_audio_list"
  286. ]
  287. )
  288. != 0
  289. ):
  290. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  291. "dynamic_video_list"
  292. ][-1]["backup_url_1"]
  293. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  294. "dynamic_audio_list"
  295. ][-1]["backup_url_1"]
  296. if len(video_url) % 3 == 1:
  297. video_url += "=="
  298. elif len(video_url) % 3 == 2:
  299. video_url += "="
  300. elif len(audio_url) % 3 == 1:
  301. audio_url += "=="
  302. elif len(audio_url) % 3 == 2:
  303. audio_url += "="
  304. video_url = base64.b64decode(video_url).decode("utf8")
  305. audio_url = base64.b64decode(audio_url).decode("utf8")
  306. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  307. "dynamic_video_list"
  308. ][-1]["vwidth"]
  309. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  310. "dynamic_video_list"
  311. ][-1]["vheight"]
  312. video_url_dict["video_url"] = video_url
  313. video_url_dict["audio_url"] = audio_url
  314. video_url_dict["video_width"] = video_width
  315. video_url_dict["video_height"] = video_height
  316. else:
  317. video_url_dict["video_url"] = ""
  318. video_url_dict["audio_url"] = ""
  319. video_url_dict["video_width"] = 0
  320. video_url_dict["video_height"] = 0
  321. elif "dash" in video_info["videoResource"]:
  322. if (
  323. "video_list" in video_info["videoResource"]["dash"]
  324. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  325. ):
  326. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  327. "backup_url_1"
  328. ]
  329. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  330. "backup_url_1"
  331. ]
  332. if len(video_url) % 3 == 1:
  333. video_url += "=="
  334. elif len(video_url) % 3 == 2:
  335. video_url += "="
  336. elif len(audio_url) % 3 == 1:
  337. audio_url += "=="
  338. elif len(audio_url) % 3 == 2:
  339. audio_url += "="
  340. video_url = base64.b64decode(video_url).decode("utf8")
  341. audio_url = base64.b64decode(audio_url).decode("utf8")
  342. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  343. "vwidth"
  344. ]
  345. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  346. "vheight"
  347. ]
  348. video_url_dict["video_url"] = video_url
  349. video_url_dict["audio_url"] = audio_url
  350. video_url_dict["video_width"] = video_width
  351. video_url_dict["video_height"] = video_height
  352. elif (
  353. "video_list" in video_info["videoResource"]["dash"]
  354. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  355. ):
  356. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  357. "backup_url_1"
  358. ]
  359. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  360. "backup_url_1"
  361. ]
  362. if len(video_url) % 3 == 1:
  363. video_url += "=="
  364. elif len(video_url) % 3 == 2:
  365. video_url += "="
  366. elif len(audio_url) % 3 == 1:
  367. audio_url += "=="
  368. elif len(audio_url) % 3 == 2:
  369. audio_url += "="
  370. video_url = base64.b64decode(video_url).decode("utf8")
  371. audio_url = base64.b64decode(audio_url).decode("utf8")
  372. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  373. "vwidth"
  374. ]
  375. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  376. "vheight"
  377. ]
  378. video_url_dict["video_url"] = video_url
  379. video_url_dict["audio_url"] = audio_url
  380. video_url_dict["video_width"] = video_width
  381. video_url_dict["video_height"] = video_height
  382. elif (
  383. "video_list" in video_info["videoResource"]["dash"]
  384. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  385. ):
  386. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  387. "backup_url_1"
  388. ]
  389. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  390. "backup_url_1"
  391. ]
  392. if len(video_url) % 3 == 1:
  393. video_url += "=="
  394. elif len(video_url) % 3 == 2:
  395. video_url += "="
  396. elif len(audio_url) % 3 == 1:
  397. audio_url += "=="
  398. elif len(audio_url) % 3 == 2:
  399. audio_url += "="
  400. video_url = base64.b64decode(video_url).decode("utf8")
  401. audio_url = base64.b64decode(audio_url).decode("utf8")
  402. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  403. "vwidth"
  404. ]
  405. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  406. "vheight"
  407. ]
  408. video_url_dict["video_url"] = video_url
  409. video_url_dict["audio_url"] = audio_url
  410. video_url_dict["video_width"] = video_width
  411. video_url_dict["video_height"] = video_height
  412. elif (
  413. "video_list" in video_info["videoResource"]["dash"]
  414. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  415. ):
  416. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  417. "backup_url_1"
  418. ]
  419. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  420. "backup_url_1"
  421. ]
  422. if len(video_url) % 3 == 1:
  423. video_url += "=="
  424. elif len(video_url) % 3 == 2:
  425. video_url += "="
  426. elif len(audio_url) % 3 == 1:
  427. audio_url += "=="
  428. elif len(audio_url) % 3 == 2:
  429. audio_url += "="
  430. video_url = base64.b64decode(video_url).decode("utf8")
  431. audio_url = base64.b64decode(audio_url).decode("utf8")
  432. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  433. "vwidth"
  434. ]
  435. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  436. "vheight"
  437. ]
  438. video_url_dict["video_url"] = video_url
  439. video_url_dict["audio_url"] = audio_url
  440. video_url_dict["video_width"] = video_width
  441. video_url_dict["video_height"] = video_height
  442. elif (
  443. "dynamic_video" in video_info["videoResource"]["dash"]
  444. and "dynamic_video_list"
  445. in video_info["videoResource"]["dash"]["dynamic_video"]
  446. and "dynamic_audio_list"
  447. in video_info["videoResource"]["dash"]["dynamic_video"]
  448. and len(
  449. video_info["videoResource"]["dash"]["dynamic_video"][
  450. "dynamic_video_list"
  451. ]
  452. )
  453. != 0
  454. and len(
  455. video_info["videoResource"]["dash"]["dynamic_video"][
  456. "dynamic_audio_list"
  457. ]
  458. )
  459. != 0
  460. ):
  461. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  462. "dynamic_video_list"
  463. ][-1]["backup_url_1"]
  464. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  465. "dynamic_audio_list"
  466. ][-1]["backup_url_1"]
  467. if len(video_url) % 3 == 1:
  468. video_url += "=="
  469. elif len(video_url) % 3 == 2:
  470. video_url += "="
  471. elif len(audio_url) % 3 == 1:
  472. audio_url += "=="
  473. elif len(audio_url) % 3 == 2:
  474. audio_url += "="
  475. video_url = base64.b64decode(video_url).decode("utf8")
  476. audio_url = base64.b64decode(audio_url).decode("utf8")
  477. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  478. "dynamic_video_list"
  479. ][-1]["vwidth"]
  480. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  481. "dynamic_video_list"
  482. ][-1]["vheight"]
  483. video_url_dict["video_url"] = video_url
  484. video_url_dict["audio_url"] = audio_url
  485. video_url_dict["video_width"] = video_width
  486. video_url_dict["video_height"] = video_height
  487. else:
  488. video_url_dict["video_url"] = ""
  489. video_url_dict["audio_url"] = ""
  490. video_url_dict["video_width"] = 0
  491. video_url_dict["video_height"] = 0
  492. elif "normal" in video_info["videoResource"]:
  493. if (
  494. "video_list" in video_info["videoResource"]["normal"]
  495. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  496. ):
  497. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  498. "backup_url_1"
  499. ]
  500. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  501. "backup_url_1"
  502. ]
  503. if len(video_url) % 3 == 1:
  504. video_url += "=="
  505. elif len(video_url) % 3 == 2:
  506. video_url += "="
  507. elif len(audio_url) % 3 == 1:
  508. audio_url += "=="
  509. elif len(audio_url) % 3 == 2:
  510. audio_url += "="
  511. video_url = base64.b64decode(video_url).decode("utf8")
  512. audio_url = base64.b64decode(audio_url).decode("utf8")
  513. video_width = video_info["videoResource"]["normal"]["video_list"][
  514. "video_4"
  515. ]["vwidth"]
  516. video_height = video_info["videoResource"]["normal"]["video_list"][
  517. "video_4"
  518. ]["vheight"]
  519. video_url_dict["video_url"] = video_url
  520. video_url_dict["audio_url"] = audio_url
  521. video_url_dict["video_width"] = video_width
  522. video_url_dict["video_height"] = video_height
  523. elif (
  524. "video_list" in video_info["videoResource"]["normal"]
  525. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  526. ):
  527. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  528. "backup_url_1"
  529. ]
  530. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  531. "backup_url_1"
  532. ]
  533. if len(video_url) % 3 == 1:
  534. video_url += "=="
  535. elif len(video_url) % 3 == 2:
  536. video_url += "="
  537. elif len(audio_url) % 3 == 1:
  538. audio_url += "=="
  539. elif len(audio_url) % 3 == 2:
  540. audio_url += "="
  541. video_url = base64.b64decode(video_url).decode("utf8")
  542. audio_url = base64.b64decode(audio_url).decode("utf8")
  543. video_width = video_info["videoResource"]["normal"]["video_list"][
  544. "video_3"
  545. ]["vwidth"]
  546. video_height = video_info["videoResource"]["normal"]["video_list"][
  547. "video_3"
  548. ]["vheight"]
  549. video_url_dict["video_url"] = video_url
  550. video_url_dict["audio_url"] = audio_url
  551. video_url_dict["video_width"] = video_width
  552. video_url_dict["video_height"] = video_height
  553. elif (
  554. "video_list" in video_info["videoResource"]["normal"]
  555. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  556. ):
  557. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  558. "backup_url_1"
  559. ]
  560. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  561. "backup_url_1"
  562. ]
  563. if len(video_url) % 3 == 1:
  564. video_url += "=="
  565. elif len(video_url) % 3 == 2:
  566. video_url += "="
  567. elif len(audio_url) % 3 == 1:
  568. audio_url += "=="
  569. elif len(audio_url) % 3 == 2:
  570. audio_url += "="
  571. video_url = base64.b64decode(video_url).decode("utf8")
  572. audio_url = base64.b64decode(audio_url).decode("utf8")
  573. video_width = video_info["videoResource"]["normal"]["video_list"][
  574. "video_2"
  575. ]["vwidth"]
  576. video_height = video_info["videoResource"]["normal"]["video_list"][
  577. "video_2"
  578. ]["vheight"]
  579. video_url_dict["video_url"] = video_url
  580. video_url_dict["audio_url"] = audio_url
  581. video_url_dict["video_width"] = video_width
  582. video_url_dict["video_height"] = video_height
  583. elif (
  584. "video_list" in video_info["videoResource"]["normal"]
  585. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  586. ):
  587. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  588. "backup_url_1"
  589. ]
  590. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  591. "backup_url_1"
  592. ]
  593. if len(video_url) % 3 == 1:
  594. video_url += "=="
  595. elif len(video_url) % 3 == 2:
  596. video_url += "="
  597. elif len(audio_url) % 3 == 1:
  598. audio_url += "=="
  599. elif len(audio_url) % 3 == 2:
  600. audio_url += "="
  601. video_url = base64.b64decode(video_url).decode("utf8")
  602. audio_url = base64.b64decode(audio_url).decode("utf8")
  603. video_width = video_info["videoResource"]["normal"]["video_list"][
  604. "video_1"
  605. ]["vwidth"]
  606. video_height = video_info["videoResource"]["normal"]["video_list"][
  607. "video_1"
  608. ]["vheight"]
  609. video_url_dict["video_url"] = video_url
  610. video_url_dict["audio_url"] = audio_url
  611. video_url_dict["video_width"] = video_width
  612. video_url_dict["video_height"] = video_height
  613. elif (
  614. "dynamic_video" in video_info["videoResource"]["normal"]
  615. and "dynamic_video_list"
  616. in video_info["videoResource"]["normal"]["dynamic_video"]
  617. and "dynamic_audio_list"
  618. in video_info["videoResource"]["normal"]["dynamic_video"]
  619. and len(
  620. video_info["videoResource"]["normal"]["dynamic_video"][
  621. "dynamic_video_list"
  622. ]
  623. )
  624. != 0
  625. and len(
  626. video_info["videoResource"]["normal"]["dynamic_video"][
  627. "dynamic_audio_list"
  628. ]
  629. )
  630. != 0
  631. ):
  632. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  633. "dynamic_video_list"
  634. ][-1]["backup_url_1"]
  635. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  636. "dynamic_audio_list"
  637. ][-1]["backup_url_1"]
  638. if len(video_url) % 3 == 1:
  639. video_url += "=="
  640. elif len(video_url) % 3 == 2:
  641. video_url += "="
  642. elif len(audio_url) % 3 == 1:
  643. audio_url += "=="
  644. elif len(audio_url) % 3 == 2:
  645. audio_url += "="
  646. video_url = base64.b64decode(video_url).decode("utf8")
  647. audio_url = base64.b64decode(audio_url).decode("utf8")
  648. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  649. "dynamic_video_list"
  650. ][-1]["vwidth"]
  651. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  652. "dynamic_video_list"
  653. ][-1]["vheight"]
  654. video_url_dict["video_url"] = video_url
  655. video_url_dict["audio_url"] = audio_url
  656. video_url_dict["video_width"] = video_width
  657. video_url_dict["video_height"] = video_height
  658. else:
  659. video_url_dict["video_url"] = ""
  660. video_url_dict["audio_url"] = ""
  661. video_url_dict["video_width"] = 0
  662. video_url_dict["video_height"] = 0
  663. else:
  664. video_url_dict["video_url"] = ""
  665. video_url_dict["audio_url"] = ""
  666. video_url_dict["video_width"] = 0
  667. video_url_dict["video_height"] = 0
  668. return video_url_dict
  669. def get_comment_cnt(item_id):
  670. """
  671. 获取视频的评论数量
  672. """
  673. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  674. params = {
  675. "tab_index": "0",
  676. "count": "10",
  677. "offset": "10",
  678. "group_id": str(item_id),
  679. "item_id": str(item_id),
  680. "aid": "1768",
  681. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  682. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  683. "_signature": random_signature(),
  684. }
  685. headers = {
  686. "authority": "www.ixigua.com",
  687. "accept": "application/json, text/plain, */*",
  688. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  689. "cache-control": "no-cache",
  690. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  691. "pragma": "no-cache",
  692. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  693. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  694. "sec-ch-ua-mobile": "?0",
  695. "sec-ch-ua-platform": '"macOS"',
  696. "sec-fetch-dest": "empty",
  697. "sec-fetch-mode": "cors",
  698. "sec-fetch-site": "same-origin",
  699. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  700. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  701. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  702. }
  703. response = requests.get(
  704. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  705. )
  706. response.close()
  707. if (
  708. response.status_code != 200
  709. or "total_number" not in response.json()
  710. or response.json() == {}
  711. ):
  712. return 0
  713. return response.json().get("total_number", 0)
  714. class XiGuaAuthor:
  715. """
  716. 西瓜账号爬虫
  717. """
  718. def __init__(self, platform, mode, rule_dict, env, user_list):
  719. self.platform = platform
  720. self.mode = mode
  721. self.rule_dict = rule_dict
  722. self.env = env
  723. self.user_list = user_list
  724. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  725. self.download_count = 0
  726. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  727. def rule_maker(self, account):
  728. """
  729. 通过不同的账号生成不同的规则
  730. :param account: 输入的账号信息
  731. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  732. """
  733. temp = account['link'].split("?")[0].split("_")
  734. if len(temp) == 1:
  735. return self.rule_dict
  736. else:
  737. flag = temp[-2]
  738. match flag:
  739. case "V1":
  740. rule_dict = {
  741. "play_cnt": {"min": 100000, "max": 0},
  742. 'period': {"min": 90, "max": 90},
  743. 'special': 0.02
  744. }
  745. return rule_dict
  746. case "V2":
  747. rule_dict = {
  748. "play_cnt": {"min": 10000, "max": 0},
  749. 'period': {"min": 90, "max": 90},
  750. 'special': 0.01
  751. }
  752. return rule_dict
  753. case "V3":
  754. rule_dict = {
  755. "play_cnt": {"min": 5000, "max": 0},
  756. 'period': {"min": 90, "max": 90},
  757. 'special': 0.01
  758. }
  759. return rule_dict
  760. def get_author_list(self):
  761. """
  762. 每轮只抓取定量的数据,到达数量后自己退出
  763. 获取账号列表以及账号信息
  764. """
  765. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  766. for user_dict in self.user_list:
  767. # if self.download_count <= max_count:
  768. try:
  769. flag = user_dict["link"][0]
  770. match flag:
  771. case "V":
  772. self.get_video_list(user_dict)
  773. case "X":
  774. self.get_tiny_video_list(user_dict)
  775. case "h":
  776. self.get_video_list(user_dict)
  777. case "D":
  778. self.get_video_list(user_dict)
  779. case "B":
  780. self.get_video_list(user_dict)
  781. self.get_tiny_video_list(user_dict)
  782. except Exception as e:
  783. AliyunLogger.logging(
  784. code="3001",
  785. account=user_dict["uid"],
  786. platform=self.platform,
  787. mode=self.mode,
  788. env=self.env,
  789. message="扫描账号时出现bug, 报错是 {}".format(e)
  790. )
  791. # time.sleep(random.randint(1, 15))
  792. # else:
  793. # AliyunLogger.logging(
  794. # code="2000",
  795. # platform=self.platform,
  796. # mode=self.mode,
  797. # env=self.env,
  798. # message="本轮已经抓取足够数量的视频,已经自动退出",
  799. # )
  800. # return
  801. def get_video_list(self, user_dict):
  802. """
  803. 获取某个账号的视频列表
  804. 账号分为 3 类
  805. """
  806. offset = 0
  807. signature = random_signature()
  808. link = user_dict['link'].split("?")[0].split("_")[-1]
  809. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  810. while True:
  811. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  812. params = {
  813. "to_user_id": to_user_id,
  814. "offset": str(offset),
  815. "limit": "30",
  816. "maxBehotTime": "0",
  817. "order": "new",
  818. "isHome": "0",
  819. "_signature": signature,
  820. }
  821. headers = {
  822. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  823. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  824. }
  825. response = requests.get(
  826. url=url,
  827. headers=headers,
  828. params=params,
  829. proxies=tunnel_proxies(),
  830. timeout=5,
  831. )
  832. offset += 30
  833. if "data" not in response.text or response.status_code != 200:
  834. AliyunLogger.logging(
  835. code="3000",
  836. platform=self.platform,
  837. mode=self.mode,
  838. env=self.env,
  839. message=f"get_videoList:{response.text}\n",
  840. )
  841. return
  842. elif not response.json()["data"]["videoList"]:
  843. AliyunLogger.logging(
  844. account=link,
  845. code="3000",
  846. platform=self.platform,
  847. mode=self.mode,
  848. env=self.env,
  849. data=response.json(),
  850. message=f"没有更多数据啦~\n",
  851. )
  852. return
  853. else:
  854. feeds = response.json()["data"]["videoList"]
  855. for video_obj in feeds:
  856. try:
  857. AliyunLogger.logging(
  858. code="1001",
  859. account=user_dict['uid'],
  860. platform=self.platform,
  861. mode=self.mode,
  862. env=self.env,
  863. data=video_obj,
  864. message="扫描到一条视频",
  865. )
  866. date_flag = self.process_video_obj(video_obj, user_dict, "l")
  867. if not date_flag:
  868. return
  869. except Exception as e:
  870. AliyunLogger.logging(
  871. code="3000",
  872. platform=self.platform,
  873. mode=self.mode,
  874. env=self.env,
  875. data=video_obj,
  876. message="抓取单条视频异常, 报错原因是: {}".format(e),
  877. )
  878. def get_tiny_video_list(self, user_dict):
  879. """
  880. 获取小视频
  881. """
  882. url = "https://www.ixigua.com/api/videov2/hotsoon/video"
  883. max_behot_time = "0"
  884. link = user_dict['link'].split("?")[0].split("_")[-1]
  885. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  886. while True:
  887. params = {
  888. "to_user_id": to_user_id,
  889. "max_behot_time": max_behot_time,
  890. "_signature": random_signature()
  891. }
  892. headers = {
  893. "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
  894. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  895. }
  896. response = requests.get(
  897. url=url,
  898. headers=headers,
  899. params=params,
  900. proxies=tunnel_proxies(),
  901. timeout=5,
  902. )
  903. if "data" not in response.text or response.status_code != 200:
  904. AliyunLogger.logging(
  905. code="2000",
  906. platform=self.platform,
  907. mode=self.mode,
  908. env=self.env,
  909. message=f"get_videoList:{response.text}\n",
  910. )
  911. return
  912. elif not response.json()["data"]["data"]:
  913. AliyunLogger.logging(
  914. account=link,
  915. code="2000",
  916. platform=self.platform,
  917. mode=self.mode,
  918. env=self.env,
  919. data=response.json(),
  920. message=f"没有更多数据啦~\n",
  921. )
  922. return
  923. else:
  924. video_list = response.json()['data']['data']
  925. max_behot_time = video_list[-1]["max_behot_time"]
  926. for video_obj in video_list:
  927. try:
  928. AliyunLogger.logging(
  929. code="1001",
  930. account=user_dict['uid'],
  931. platform=self.platform,
  932. mode=self.mode,
  933. env=self.env,
  934. data=video_obj,
  935. message="扫描到一条小视频",
  936. )
  937. date_flag = self.process_video_obj(video_obj, user_dict, "s")
  938. if not date_flag:
  939. return
  940. except Exception as e:
  941. AliyunLogger.logging(
  942. code="3000",
  943. platform=self.platform,
  944. mode=self.mode,
  945. env=self.env,
  946. data=video_obj,
  947. message="抓取单条视频异常, 报错原因是: {}".format(e),
  948. )
  949. def process_video_obj(self, video_obj, user_dict, f):
  950. """
  951. process video_obj and extract video_url
  952. """
  953. new_rule = self.rule_maker(user_dict)
  954. trace_id = self.platform + str(uuid.uuid1())
  955. if f == "s":
  956. item_id = video_obj.get("id_str", "")
  957. else:
  958. item_id = video_obj.get("item_id", "")
  959. if not item_id:
  960. AliyunLogger.logging(
  961. code="2005",
  962. account=user_dict['uid'],
  963. platform=self.platform,
  964. mode=self.mode,
  965. env=self.env,
  966. message="无效视频",
  967. data=video_obj,
  968. trace_id=trace_id,
  969. )
  970. return
  971. # 获取视频信息
  972. video_dict = self.get_video_info(item_id=item_id)
  973. video_dict["platform"] = self.platform
  974. video_dict["strategy"] = self.mode
  975. video_dict["out_video_id"] = video_dict["video_id"]
  976. video_dict["width"] = video_dict["video_width"]
  977. video_dict["height"] = video_dict["video_height"]
  978. video_dict["crawler_rule"] = json.dumps(new_rule)
  979. video_dict["user_id"] = user_dict["uid"]
  980. video_dict["publish_time"] = video_dict["publish_time_str"]
  981. video_dict["strategy_type"] = self.mode
  982. video_dict["update_time_stamp"] = int(time.time())
  983. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  984. new_rule.get("period", {}).get("max", 1000)):
  985. if not video_obj['is_top']:
  986. """
  987. 非置顶数据发布时间超过才退出
  988. """
  989. AliyunLogger.logging(
  990. code="2004",
  991. account=user_dict['uid'],
  992. platform=self.platform,
  993. mode=self.mode,
  994. env=self.env,
  995. data=video_dict,
  996. message="发布时间超过{}天".format(
  997. int(new_rule.get("period", {}).get("max", 1000))
  998. ),
  999. )
  1000. return False
  1001. pipeline = PiaoQuanPipeline(
  1002. platform=self.platform,
  1003. mode=self.mode,
  1004. rule_dict=new_rule,
  1005. env=self.env,
  1006. item=video_dict,
  1007. trace_id=trace_id,
  1008. )
  1009. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  1010. if limit_flag:
  1011. title_flag = pipeline.title_flag()
  1012. repeat_flag = pipeline.repeat_video()
  1013. if title_flag and repeat_flag:
  1014. if new_rule.get("special"):
  1015. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1016. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  1017. self.mq.send_msg(video_dict)
  1018. self.download_count += 1
  1019. AliyunLogger.logging(
  1020. code="1002",
  1021. account=user_dict['uid'],
  1022. platform=self.platform,
  1023. mode=self.mode,
  1024. env=self.env,
  1025. data=video_dict,
  1026. trace_id=trace_id,
  1027. message="成功发送 MQ 至 ETL",
  1028. )
  1029. return True
  1030. else:
  1031. AliyunLogger.logging(
  1032. code="2008",
  1033. account=user_dict['uid'],
  1034. platform=self.platform,
  1035. mode=self.mode,
  1036. env=self.env,
  1037. message="不满足特殊规则, 点赞量/播放量",
  1038. data=video_dict
  1039. )
  1040. else:
  1041. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1042. self.mq.send_msg(video_dict)
  1043. self.download_count += 1
  1044. AliyunLogger.logging(
  1045. code="1002",
  1046. account=user_dict['uid'],
  1047. platform=self.platform,
  1048. mode=self.mode,
  1049. env=self.env,
  1050. data=video_dict,
  1051. trace_id=trace_id,
  1052. message="成功发送 MQ 至 ETL",
  1053. )
  1054. return True
  1055. else:
  1056. AliyunLogger.logging(
  1057. code="2008",
  1058. account=user_dict['uid'],
  1059. platform=self.platform,
  1060. mode=self.mode,
  1061. env=self.env,
  1062. message="不满足特殊规则, 播放量",
  1063. data=video_dict
  1064. )
  1065. return True
  1066. def get_video_info(self, item_id):
  1067. """
  1068. 获取视频信息
  1069. """
  1070. url = "https://www.ixigua.com/{}".format(item_id)
  1071. headers = {
  1072. "accept-encoding": "gzip, deflate",
  1073. "accept-language": "zh-CN,zh-Hans;q=0.9",
  1074. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  1075. "user-agent": FakeUserAgent().random,
  1076. "referer": "https://www.ixigua.com/{}/".format(item_id),
  1077. }
  1078. response = requests.get(
  1079. url=url,
  1080. headers=headers,
  1081. proxies=tunnel_proxies(),
  1082. timeout=5,
  1083. )
  1084. time.sleep(random.randint(1, 5))
  1085. video_info = extract_info_by_re(response.text)
  1086. video_dict = {
  1087. "video_title": video_info.get("title", ""),
  1088. "video_id": video_info.get("video_id"),
  1089. "gid": str(item_id),
  1090. "play_cnt": int(video_info.get("play_count", 0)),
  1091. "like_cnt": int(video_info.get("like_count", 0)),
  1092. "comment_cnt": 0,
  1093. "share_cnt": 0,
  1094. "favorite_cnt": 0,
  1095. "duration": int(video_info.get("duration", 0)),
  1096. "video_width": 0,
  1097. "video_height": 0,
  1098. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  1099. "publish_time_str": time.strftime(
  1100. "%Y-%m-%d %H:%M:%S",
  1101. time.localtime(int(video_info.get("publish_time", 0))),
  1102. ),
  1103. "avatar_url": str(
  1104. video_info.get("user_info", {}).get("avatar_url", "")
  1105. ),
  1106. "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
  1107. "video_url": video_info.get("url"),
  1108. "session": f"xigua-author-{int(time.time())}",
  1109. }
  1110. return video_dict
  1111. if __name__ == "__main__":
  1112. user_list = [
  1113. {
  1114. "uid": 6267140,
  1115. "source": "xigua",
  1116. "link": "https://www.ixigua.com/home/2779177225827568",
  1117. "nick_name": "秋晴爱音乐",
  1118. "avatar_url": "",
  1119. "mode": "author",
  1120. },
  1121. {
  1122. "uid": 6267140,
  1123. "source": "xigua",
  1124. "link": "https://www.ixigua.com/home/2885546124776780",
  1125. "nick_name": "朗诵放歌的老山羊",
  1126. "avatar_url": "",
  1127. "mode": "author",
  1128. },
  1129. {
  1130. "uid": 6267140,
  1131. "source": "xigua",
  1132. "link": "https://www.ixigua.com/home/5880938217",
  1133. "nick_name": "天原声疗",
  1134. "avatar_url": "",
  1135. "mode": "author",
  1136. },
  1137. ]
  1138. rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1139. XGA = XiGuaAuthor(
  1140. platform="xigua",
  1141. mode="author",
  1142. rule_dict=rule,
  1143. env="prod",
  1144. user_list=user_list
  1145. )
  1146. XGA.get_author_list()