xigua_author.py 49 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198
  1. import json
  2. import os
  3. import re
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from lxml import etree
  12. from Crypto.Cipher import AES
  13. from Crypto.Util.Padding import unpad
  14. from fake_useragent import FakeUserAgent
  15. from common.mq import MQ
  16. sys.path.append(os.getcwd())
  17. from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
  18. from common.limit import AuthorLimit
  19. def aes_decrypt(data: str, key: str) -> str:
  20. """
  21. XiGua AES decrypt
  22. :param data:
  23. :param key:
  24. :return:
  25. """
  26. password = key.encode()
  27. iv = password[:16]
  28. try:
  29. ct = base64.b64decode(data.encode())
  30. cipher = AES.new(password, AES.MODE_CBC, iv)
  31. pt = unpad(cipher.decrypt(ct), AES.block_size)
  32. return base64.b64decode(pt).decode()
  33. except Exception as e:
  34. print("Incorrect decryption {}".format(e))
  35. return None
  36. def extract_video_url(text):
  37. """
  38. 获取视频 video_url
  39. :param text:
  40. :return:
  41. """
  42. HTML = etree.HTML(text)
  43. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  44. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  45. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  46. # python中不规则的定义
  47. for I in Irregulars:
  48. if I in ['=false', '=true']:
  49. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  50. else:
  51. json_2 = json_2.replace(I, '12')
  52. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]
  53. if dict_2['dash'] == 12:
  54. obj = dict_2['normal']
  55. ptk = obj['ptk']
  56. main_url = obj['video_list']['video_3']['main_url']
  57. real_video_url = aes_decrypt(data=main_url, key=ptk)
  58. else:
  59. obj = dict_2['dash']
  60. ptk = obj["ptk"]
  61. video_url = obj['dynamic_video']['main_url']
  62. real_video_url = aes_decrypt(data=video_url, key=ptk)
  63. return real_video_url
  64. def extract_info_by_re(text):
  65. """
  66. 通过正则表达式获取文本中的信息
  67. :param text:
  68. :return:
  69. """
  70. # 标题
  71. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  72. if title_match:
  73. title_content = title_match.group(1)
  74. title_content = title_content.split(" - ")[0]
  75. title_content = bytes(title_content, "latin1").decode()
  76. else:
  77. title_content = ""
  78. # video_id
  79. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  80. # like_count
  81. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  82. # cover_url
  83. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  84. # video_play
  85. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  86. # "video_publish_time"
  87. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  88. # video_duration
  89. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  90. return {
  91. "title": title_content,
  92. "url": extract_video_url(text),
  93. "video_id": video_id,
  94. "like_count": like_count,
  95. "cover_url": cover_url,
  96. "play_count": video_watch_count,
  97. "publish_time": publish_time,
  98. "duration": duration
  99. }
  100. def random_signature():
  101. """
  102. 随机生成签名
  103. """
  104. src_digits = string.digits # string_数字
  105. src_uppercase = string.ascii_uppercase # string_大写字母
  106. src_lowercase = string.ascii_lowercase # string_小写字母
  107. digits_num = random.randint(1, 6)
  108. uppercase_num = random.randint(1, 26 - digits_num - 1)
  109. lowercase_num = 26 - (digits_num + uppercase_num)
  110. password = (
  111. random.sample(src_digits, digits_num)
  112. + random.sample(src_uppercase, uppercase_num)
  113. + random.sample(src_lowercase, lowercase_num)
  114. )
  115. random.shuffle(password)
  116. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  117. new_password_start = new_password[0:18]
  118. new_password_end = new_password[-7:]
  119. if new_password[18] == "8":
  120. new_password = new_password_start + "w" + new_password_end
  121. elif new_password[18] == "9":
  122. new_password = new_password_start + "x" + new_password_end
  123. elif new_password[18] == "-":
  124. new_password = new_password_start + "y" + new_password_end
  125. elif new_password[18] == ".":
  126. new_password = new_password_start + "z" + new_password_end
  127. else:
  128. new_password = new_password_start + "y" + new_password_end
  129. return new_password
  130. def byte_dance_cookie(item_id):
  131. """
  132. 获取西瓜视频的 cookie
  133. :param item_id:
  134. """
  135. sess = requests.Session()
  136. sess.headers.update({
  137. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  138. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  139. })
  140. # 获取 cookies
  141. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  142. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  143. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  144. # print(r.text)
  145. return r.cookies.values()[0]
  146. def get_video_url(video_info):
  147. """
  148. 获取视频的链接
  149. """
  150. video_url_dict = {}
  151. # video_url
  152. if "videoResource" not in video_info:
  153. video_url_dict["video_url"] = ""
  154. video_url_dict["audio_url"] = ""
  155. video_url_dict["video_width"] = 0
  156. video_url_dict["video_height"] = 0
  157. elif "dash_120fps" in video_info["videoResource"]:
  158. if (
  159. "video_list" in video_info["videoResource"]["dash_120fps"]
  160. and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
  161. ):
  162. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  163. "video_4"
  164. ]["backup_url_1"]
  165. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  166. "video_4"
  167. ]["backup_url_1"]
  168. if len(video_url) % 3 == 1:
  169. video_url += "=="
  170. elif len(video_url) % 3 == 2:
  171. video_url += "="
  172. elif len(audio_url) % 3 == 1:
  173. audio_url += "=="
  174. elif len(audio_url) % 3 == 2:
  175. audio_url += "="
  176. video_url = base64.b64decode(video_url).decode("utf8")
  177. audio_url = base64.b64decode(audio_url).decode("utf8")
  178. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  179. "video_4"
  180. ]["vwidth"]
  181. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  182. "video_4"
  183. ]["vheight"]
  184. video_url_dict["video_url"] = video_url
  185. video_url_dict["audio_url"] = audio_url
  186. video_url_dict["video_width"] = video_width
  187. video_url_dict["video_height"] = video_height
  188. elif (
  189. "video_list" in video_info["videoResource"]["dash_120fps"]
  190. and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
  191. ):
  192. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  193. "video_3"
  194. ]["backup_url_1"]
  195. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  196. "video_3"
  197. ]["backup_url_1"]
  198. if len(video_url) % 3 == 1:
  199. video_url += "=="
  200. elif len(video_url) % 3 == 2:
  201. video_url += "="
  202. elif len(audio_url) % 3 == 1:
  203. audio_url += "=="
  204. elif len(audio_url) % 3 == 2:
  205. audio_url += "="
  206. video_url = base64.b64decode(video_url).decode("utf8")
  207. audio_url = base64.b64decode(audio_url).decode("utf8")
  208. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  209. "video_3"
  210. ]["vwidth"]
  211. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  212. "video_3"
  213. ]["vheight"]
  214. video_url_dict["video_url"] = video_url
  215. video_url_dict["audio_url"] = audio_url
  216. video_url_dict["video_width"] = video_width
  217. video_url_dict["video_height"] = video_height
  218. elif (
  219. "video_list" in video_info["videoResource"]["dash_120fps"]
  220. and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
  221. ):
  222. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  223. "video_2"
  224. ]["backup_url_1"]
  225. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  226. "video_2"
  227. ]["backup_url_1"]
  228. if len(video_url) % 3 == 1:
  229. video_url += "=="
  230. elif len(video_url) % 3 == 2:
  231. video_url += "="
  232. elif len(audio_url) % 3 == 1:
  233. audio_url += "=="
  234. elif len(audio_url) % 3 == 2:
  235. audio_url += "="
  236. video_url = base64.b64decode(video_url).decode("utf8")
  237. audio_url = base64.b64decode(audio_url).decode("utf8")
  238. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  239. "video_2"
  240. ]["vwidth"]
  241. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  242. "video_2"
  243. ]["vheight"]
  244. video_url_dict["video_url"] = video_url
  245. video_url_dict["audio_url"] = audio_url
  246. video_url_dict["video_width"] = video_width
  247. video_url_dict["video_height"] = video_height
  248. elif (
  249. "video_list" in video_info["videoResource"]["dash_120fps"]
  250. and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
  251. ):
  252. video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  253. "video_1"
  254. ]["backup_url_1"]
  255. audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
  256. "video_1"
  257. ]["backup_url_1"]
  258. if len(video_url) % 3 == 1:
  259. video_url += "=="
  260. elif len(video_url) % 3 == 2:
  261. video_url += "="
  262. elif len(audio_url) % 3 == 1:
  263. audio_url += "=="
  264. elif len(audio_url) % 3 == 2:
  265. audio_url += "="
  266. video_url = base64.b64decode(video_url).decode("utf8")
  267. audio_url = base64.b64decode(audio_url).decode("utf8")
  268. video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
  269. "video_1"
  270. ]["vwidth"]
  271. video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
  272. "video_1"
  273. ]["vheight"]
  274. video_url_dict["video_url"] = video_url
  275. video_url_dict["audio_url"] = audio_url
  276. video_url_dict["video_width"] = video_width
  277. video_url_dict["video_height"] = video_height
  278. elif (
  279. "dynamic_video" in video_info["videoResource"]["dash_120fps"]
  280. and "dynamic_video_list"
  281. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  282. and "dynamic_audio_list"
  283. in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
  284. and len(
  285. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  286. "dynamic_video_list"
  287. ]
  288. )
  289. != 0
  290. and len(
  291. video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  292. "dynamic_audio_list"
  293. ]
  294. )
  295. != 0
  296. ):
  297. video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  298. "dynamic_video_list"
  299. ][-1]["backup_url_1"]
  300. audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  301. "dynamic_audio_list"
  302. ][-1]["backup_url_1"]
  303. if len(video_url) % 3 == 1:
  304. video_url += "=="
  305. elif len(video_url) % 3 == 2:
  306. video_url += "="
  307. elif len(audio_url) % 3 == 1:
  308. audio_url += "=="
  309. elif len(audio_url) % 3 == 2:
  310. audio_url += "="
  311. video_url = base64.b64decode(video_url).decode("utf8")
  312. audio_url = base64.b64decode(audio_url).decode("utf8")
  313. video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  314. "dynamic_video_list"
  315. ][-1]["vwidth"]
  316. video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
  317. "dynamic_video_list"
  318. ][-1]["vheight"]
  319. video_url_dict["video_url"] = video_url
  320. video_url_dict["audio_url"] = audio_url
  321. video_url_dict["video_width"] = video_width
  322. video_url_dict["video_height"] = video_height
  323. else:
  324. video_url_dict["video_url"] = ""
  325. video_url_dict["audio_url"] = ""
  326. video_url_dict["video_width"] = 0
  327. video_url_dict["video_height"] = 0
  328. elif "dash" in video_info["videoResource"]:
  329. if (
  330. "video_list" in video_info["videoResource"]["dash"]
  331. and "video_4" in video_info["videoResource"]["dash"]["video_list"]
  332. ):
  333. video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  334. "backup_url_1"
  335. ]
  336. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  337. "backup_url_1"
  338. ]
  339. if len(video_url) % 3 == 1:
  340. video_url += "=="
  341. elif len(video_url) % 3 == 2:
  342. video_url += "="
  343. elif len(audio_url) % 3 == 1:
  344. audio_url += "=="
  345. elif len(audio_url) % 3 == 2:
  346. audio_url += "="
  347. video_url = base64.b64decode(video_url).decode("utf8")
  348. audio_url = base64.b64decode(audio_url).decode("utf8")
  349. video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  350. "vwidth"
  351. ]
  352. video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
  353. "vheight"
  354. ]
  355. video_url_dict["video_url"] = video_url
  356. video_url_dict["audio_url"] = audio_url
  357. video_url_dict["video_width"] = video_width
  358. video_url_dict["video_height"] = video_height
  359. elif (
  360. "video_list" in video_info["videoResource"]["dash"]
  361. and "video_3" in video_info["videoResource"]["dash"]["video_list"]
  362. ):
  363. video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  364. "backup_url_1"
  365. ]
  366. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  367. "backup_url_1"
  368. ]
  369. if len(video_url) % 3 == 1:
  370. video_url += "=="
  371. elif len(video_url) % 3 == 2:
  372. video_url += "="
  373. elif len(audio_url) % 3 == 1:
  374. audio_url += "=="
  375. elif len(audio_url) % 3 == 2:
  376. audio_url += "="
  377. video_url = base64.b64decode(video_url).decode("utf8")
  378. audio_url = base64.b64decode(audio_url).decode("utf8")
  379. video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  380. "vwidth"
  381. ]
  382. video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
  383. "vheight"
  384. ]
  385. video_url_dict["video_url"] = video_url
  386. video_url_dict["audio_url"] = audio_url
  387. video_url_dict["video_width"] = video_width
  388. video_url_dict["video_height"] = video_height
  389. elif (
  390. "video_list" in video_info["videoResource"]["dash"]
  391. and "video_2" in video_info["videoResource"]["dash"]["video_list"]
  392. ):
  393. video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  394. "backup_url_1"
  395. ]
  396. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  397. "backup_url_1"
  398. ]
  399. if len(video_url) % 3 == 1:
  400. video_url += "=="
  401. elif len(video_url) % 3 == 2:
  402. video_url += "="
  403. elif len(audio_url) % 3 == 1:
  404. audio_url += "=="
  405. elif len(audio_url) % 3 == 2:
  406. audio_url += "="
  407. video_url = base64.b64decode(video_url).decode("utf8")
  408. audio_url = base64.b64decode(audio_url).decode("utf8")
  409. video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  410. "vwidth"
  411. ]
  412. video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
  413. "vheight"
  414. ]
  415. video_url_dict["video_url"] = video_url
  416. video_url_dict["audio_url"] = audio_url
  417. video_url_dict["video_width"] = video_width
  418. video_url_dict["video_height"] = video_height
  419. elif (
  420. "video_list" in video_info["videoResource"]["dash"]
  421. and "video_1" in video_info["videoResource"]["dash"]["video_list"]
  422. ):
  423. video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  424. "backup_url_1"
  425. ]
  426. audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  427. "backup_url_1"
  428. ]
  429. if len(video_url) % 3 == 1:
  430. video_url += "=="
  431. elif len(video_url) % 3 == 2:
  432. video_url += "="
  433. elif len(audio_url) % 3 == 1:
  434. audio_url += "=="
  435. elif len(audio_url) % 3 == 2:
  436. audio_url += "="
  437. video_url = base64.b64decode(video_url).decode("utf8")
  438. audio_url = base64.b64decode(audio_url).decode("utf8")
  439. video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  440. "vwidth"
  441. ]
  442. video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
  443. "vheight"
  444. ]
  445. video_url_dict["video_url"] = video_url
  446. video_url_dict["audio_url"] = audio_url
  447. video_url_dict["video_width"] = video_width
  448. video_url_dict["video_height"] = video_height
  449. elif (
  450. "dynamic_video" in video_info["videoResource"]["dash"]
  451. and "dynamic_video_list"
  452. in video_info["videoResource"]["dash"]["dynamic_video"]
  453. and "dynamic_audio_list"
  454. in video_info["videoResource"]["dash"]["dynamic_video"]
  455. and len(
  456. video_info["videoResource"]["dash"]["dynamic_video"][
  457. "dynamic_video_list"
  458. ]
  459. )
  460. != 0
  461. and len(
  462. video_info["videoResource"]["dash"]["dynamic_video"][
  463. "dynamic_audio_list"
  464. ]
  465. )
  466. != 0
  467. ):
  468. video_url = video_info["videoResource"]["dash"]["dynamic_video"][
  469. "dynamic_video_list"
  470. ][-1]["backup_url_1"]
  471. audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
  472. "dynamic_audio_list"
  473. ][-1]["backup_url_1"]
  474. if len(video_url) % 3 == 1:
  475. video_url += "=="
  476. elif len(video_url) % 3 == 2:
  477. video_url += "="
  478. elif len(audio_url) % 3 == 1:
  479. audio_url += "=="
  480. elif len(audio_url) % 3 == 2:
  481. audio_url += "="
  482. video_url = base64.b64decode(video_url).decode("utf8")
  483. audio_url = base64.b64decode(audio_url).decode("utf8")
  484. video_width = video_info["videoResource"]["dash"]["dynamic_video"][
  485. "dynamic_video_list"
  486. ][-1]["vwidth"]
  487. video_height = video_info["videoResource"]["dash"]["dynamic_video"][
  488. "dynamic_video_list"
  489. ][-1]["vheight"]
  490. video_url_dict["video_url"] = video_url
  491. video_url_dict["audio_url"] = audio_url
  492. video_url_dict["video_width"] = video_width
  493. video_url_dict["video_height"] = video_height
  494. else:
  495. video_url_dict["video_url"] = ""
  496. video_url_dict["audio_url"] = ""
  497. video_url_dict["video_width"] = 0
  498. video_url_dict["video_height"] = 0
  499. elif "normal" in video_info["videoResource"]:
  500. if (
  501. "video_list" in video_info["videoResource"]["normal"]
  502. and "video_4" in video_info["videoResource"]["normal"]["video_list"]
  503. ):
  504. video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  505. "backup_url_1"
  506. ]
  507. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
  508. "backup_url_1"
  509. ]
  510. if len(video_url) % 3 == 1:
  511. video_url += "=="
  512. elif len(video_url) % 3 == 2:
  513. video_url += "="
  514. elif len(audio_url) % 3 == 1:
  515. audio_url += "=="
  516. elif len(audio_url) % 3 == 2:
  517. audio_url += "="
  518. video_url = base64.b64decode(video_url).decode("utf8")
  519. audio_url = base64.b64decode(audio_url).decode("utf8")
  520. video_width = video_info["videoResource"]["normal"]["video_list"][
  521. "video_4"
  522. ]["vwidth"]
  523. video_height = video_info["videoResource"]["normal"]["video_list"][
  524. "video_4"
  525. ]["vheight"]
  526. video_url_dict["video_url"] = video_url
  527. video_url_dict["audio_url"] = audio_url
  528. video_url_dict["video_width"] = video_width
  529. video_url_dict["video_height"] = video_height
  530. elif (
  531. "video_list" in video_info["videoResource"]["normal"]
  532. and "video_3" in video_info["videoResource"]["normal"]["video_list"]
  533. ):
  534. video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  535. "backup_url_1"
  536. ]
  537. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
  538. "backup_url_1"
  539. ]
  540. if len(video_url) % 3 == 1:
  541. video_url += "=="
  542. elif len(video_url) % 3 == 2:
  543. video_url += "="
  544. elif len(audio_url) % 3 == 1:
  545. audio_url += "=="
  546. elif len(audio_url) % 3 == 2:
  547. audio_url += "="
  548. video_url = base64.b64decode(video_url).decode("utf8")
  549. audio_url = base64.b64decode(audio_url).decode("utf8")
  550. video_width = video_info["videoResource"]["normal"]["video_list"][
  551. "video_3"
  552. ]["vwidth"]
  553. video_height = video_info["videoResource"]["normal"]["video_list"][
  554. "video_3"
  555. ]["vheight"]
  556. video_url_dict["video_url"] = video_url
  557. video_url_dict["audio_url"] = audio_url
  558. video_url_dict["video_width"] = video_width
  559. video_url_dict["video_height"] = video_height
  560. elif (
  561. "video_list" in video_info["videoResource"]["normal"]
  562. and "video_2" in video_info["videoResource"]["normal"]["video_list"]
  563. ):
  564. video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  565. "backup_url_1"
  566. ]
  567. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
  568. "backup_url_1"
  569. ]
  570. if len(video_url) % 3 == 1:
  571. video_url += "=="
  572. elif len(video_url) % 3 == 2:
  573. video_url += "="
  574. elif len(audio_url) % 3 == 1:
  575. audio_url += "=="
  576. elif len(audio_url) % 3 == 2:
  577. audio_url += "="
  578. video_url = base64.b64decode(video_url).decode("utf8")
  579. audio_url = base64.b64decode(audio_url).decode("utf8")
  580. video_width = video_info["videoResource"]["normal"]["video_list"][
  581. "video_2"
  582. ]["vwidth"]
  583. video_height = video_info["videoResource"]["normal"]["video_list"][
  584. "video_2"
  585. ]["vheight"]
  586. video_url_dict["video_url"] = video_url
  587. video_url_dict["audio_url"] = audio_url
  588. video_url_dict["video_width"] = video_width
  589. video_url_dict["video_height"] = video_height
  590. elif (
  591. "video_list" in video_info["videoResource"]["normal"]
  592. and "video_1" in video_info["videoResource"]["normal"]["video_list"]
  593. ):
  594. video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  595. "backup_url_1"
  596. ]
  597. audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
  598. "backup_url_1"
  599. ]
  600. if len(video_url) % 3 == 1:
  601. video_url += "=="
  602. elif len(video_url) % 3 == 2:
  603. video_url += "="
  604. elif len(audio_url) % 3 == 1:
  605. audio_url += "=="
  606. elif len(audio_url) % 3 == 2:
  607. audio_url += "="
  608. video_url = base64.b64decode(video_url).decode("utf8")
  609. audio_url = base64.b64decode(audio_url).decode("utf8")
  610. video_width = video_info["videoResource"]["normal"]["video_list"][
  611. "video_1"
  612. ]["vwidth"]
  613. video_height = video_info["videoResource"]["normal"]["video_list"][
  614. "video_1"
  615. ]["vheight"]
  616. video_url_dict["video_url"] = video_url
  617. video_url_dict["audio_url"] = audio_url
  618. video_url_dict["video_width"] = video_width
  619. video_url_dict["video_height"] = video_height
  620. elif (
  621. "dynamic_video" in video_info["videoResource"]["normal"]
  622. and "dynamic_video_list"
  623. in video_info["videoResource"]["normal"]["dynamic_video"]
  624. and "dynamic_audio_list"
  625. in video_info["videoResource"]["normal"]["dynamic_video"]
  626. and len(
  627. video_info["videoResource"]["normal"]["dynamic_video"][
  628. "dynamic_video_list"
  629. ]
  630. )
  631. != 0
  632. and len(
  633. video_info["videoResource"]["normal"]["dynamic_video"][
  634. "dynamic_audio_list"
  635. ]
  636. )
  637. != 0
  638. ):
  639. video_url = video_info["videoResource"]["normal"]["dynamic_video"][
  640. "dynamic_video_list"
  641. ][-1]["backup_url_1"]
  642. audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
  643. "dynamic_audio_list"
  644. ][-1]["backup_url_1"]
  645. if len(video_url) % 3 == 1:
  646. video_url += "=="
  647. elif len(video_url) % 3 == 2:
  648. video_url += "="
  649. elif len(audio_url) % 3 == 1:
  650. audio_url += "=="
  651. elif len(audio_url) % 3 == 2:
  652. audio_url += "="
  653. video_url = base64.b64decode(video_url).decode("utf8")
  654. audio_url = base64.b64decode(audio_url).decode("utf8")
  655. video_width = video_info["videoResource"]["normal"]["dynamic_video"][
  656. "dynamic_video_list"
  657. ][-1]["vwidth"]
  658. video_height = video_info["videoResource"]["normal"]["dynamic_video"][
  659. "dynamic_video_list"
  660. ][-1]["vheight"]
  661. video_url_dict["video_url"] = video_url
  662. video_url_dict["audio_url"] = audio_url
  663. video_url_dict["video_width"] = video_width
  664. video_url_dict["video_height"] = video_height
  665. else:
  666. video_url_dict["video_url"] = ""
  667. video_url_dict["audio_url"] = ""
  668. video_url_dict["video_width"] = 0
  669. video_url_dict["video_height"] = 0
  670. else:
  671. video_url_dict["video_url"] = ""
  672. video_url_dict["audio_url"] = ""
  673. video_url_dict["video_width"] = 0
  674. video_url_dict["video_height"] = 0
  675. return video_url_dict
  676. def get_comment_cnt(item_id):
  677. """
  678. 获取视频的评论数量
  679. """
  680. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  681. params = {
  682. "tab_index": "0",
  683. "count": "10",
  684. "offset": "10",
  685. "group_id": str(item_id),
  686. "item_id": str(item_id),
  687. "aid": "1768",
  688. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  689. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  690. "_signature": random_signature(),
  691. }
  692. headers = {
  693. "authority": "www.ixigua.com",
  694. "accept": "application/json, text/plain, */*",
  695. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  696. "cache-control": "no-cache",
  697. "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
  698. "pragma": "no-cache",
  699. "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
  700. "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  701. "sec-ch-ua-mobile": "?0",
  702. "sec-ch-ua-platform": '"macOS"',
  703. "sec-fetch-dest": "empty",
  704. "sec-fetch-mode": "cors",
  705. "sec-fetch-site": "same-origin",
  706. "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
  707. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
  708. "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
  709. }
  710. response = requests.get(
  711. url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
  712. )
  713. response.close()
  714. if (
  715. response.status_code != 200
  716. or "total_number" not in response.json()
  717. or response.json() == {}
  718. ):
  719. return 0
  720. return response.json().get("total_number", 0)
  721. class XiGuaAuthor:
  722. """
  723. 西瓜账号爬虫
  724. """
  725. def __init__(self, platform, mode, rule_dict, env, user_list):
  726. self.platform = platform
  727. self.mode = mode
  728. self.rule_dict = rule_dict
  729. self.env = env
  730. self.user_list = user_list
  731. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  732. self.download_count = 0
  733. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  734. def rule_maker(self, account):
  735. """
  736. 通过不同的账号生成不同的规则
  737. :param account: 输入的账号信息
  738. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  739. """
  740. temp = account['link'].split("?")[0].split("_")
  741. if len(temp) == 1:
  742. return self.rule_dict
  743. else:
  744. flag = temp[-2]
  745. match flag:
  746. case "V1":
  747. rule_dict = {
  748. "play_cnt": {"min": 100000, "max": 0},
  749. 'period': {"min": 90, "max": 90},
  750. 'special': 0.02
  751. }
  752. return rule_dict
  753. case "V2":
  754. rule_dict = {
  755. "play_cnt": {"min": 10000, "max": 0},
  756. 'period': {"min": 90, "max": 90},
  757. 'special': 0.01
  758. }
  759. return rule_dict
  760. case "V3":
  761. rule_dict = {
  762. "play_cnt": {"min": 5000, "max": 0},
  763. 'period': {"min": 90, "max": 90},
  764. 'special': 0.01
  765. }
  766. return rule_dict
  767. def get_author_list(self):
  768. """
  769. 每轮只抓取定量的数据,到达数量后自己退出
  770. 获取账号列表以及账号信息
  771. """
  772. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  773. for user_dict in self.user_list:
  774. # if self.download_count <= max_count:
  775. try:
  776. flag = user_dict["link"][0]
  777. match flag:
  778. case "V":
  779. self.get_video_list(user_dict)
  780. case "X":
  781. self.get_tiny_video_list(user_dict)
  782. case "h":
  783. self.get_video_list(user_dict)
  784. case "D":
  785. self.get_video_list(user_dict)
  786. case "B":
  787. self.get_video_list(user_dict)
  788. self.get_tiny_video_list(user_dict)
  789. except Exception as e:
  790. AliyunLogger.logging(
  791. code="3001",
  792. account=user_dict["uid"],
  793. platform=self.platform,
  794. mode=self.mode,
  795. env=self.env,
  796. message="扫描账号时出现bug, 报错是 {}".format(e)
  797. )
  798. # time.sleep(random.randint(1, 15))
  799. # else:
  800. # AliyunLogger.logging(
  801. # code="2000",
  802. # platform=self.platform,
  803. # mode=self.mode,
  804. # env=self.env,
  805. # message="本轮已经抓取足够数量的视频,已经自动退出",
  806. # )
  807. # return
  808. def get_video_list(self, user_dict):
  809. """
  810. 获取某个账号的视频列表
  811. 账号分为 3 类
  812. """
  813. offset = 0
  814. signature = random_signature()
  815. link = user_dict['link'].split("?")[0].split("_")[-1]
  816. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  817. while True:
  818. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  819. params = {
  820. "to_user_id": to_user_id,
  821. "offset": str(offset),
  822. "limit": "30",
  823. "maxBehotTime": "0",
  824. "order": "new",
  825. "isHome": "0",
  826. "_signature": signature,
  827. }
  828. headers = {
  829. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  830. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  831. }
  832. response = requests.get(
  833. url=url,
  834. headers=headers,
  835. params=params,
  836. proxies=tunnel_proxies(),
  837. timeout=5,
  838. )
  839. offset += 30
  840. if "data" not in response.text or response.status_code != 200:
  841. AliyunLogger.logging(
  842. code="3000",
  843. platform=self.platform,
  844. mode=self.mode,
  845. env=self.env,
  846. message=f"get_videoList:{response.text}\n",
  847. )
  848. return
  849. elif not response.json()["data"]["videoList"]:
  850. AliyunLogger.logging(
  851. account=link,
  852. code="3000",
  853. platform=self.platform,
  854. mode=self.mode,
  855. env=self.env,
  856. data=response.json(),
  857. message=f"没有更多数据啦~\n",
  858. )
  859. return
  860. else:
  861. feeds = response.json()["data"]["videoList"]
  862. for video_obj in feeds:
  863. try:
  864. AliyunLogger.logging(
  865. code="1001",
  866. account=user_dict['uid'],
  867. platform=self.platform,
  868. mode=self.mode,
  869. env=self.env,
  870. data=video_obj,
  871. message="扫描到一条视频",
  872. )
  873. date_flag = self.process_video_obj(video_obj, user_dict, "l")
  874. if not date_flag:
  875. return
  876. except Exception as e:
  877. AliyunLogger.logging(
  878. code="3000",
  879. platform=self.platform,
  880. mode=self.mode,
  881. env=self.env,
  882. data=video_obj,
  883. message="抓取单条视频异常, 报错原因是: {}".format(e),
  884. )
  885. def get_tiny_video_list(self, user_dict):
  886. """
  887. 获取小视频
  888. """
  889. url = "https://www.ixigua.com/api/videov2/hotsoon/video"
  890. max_behot_time = "0"
  891. link = user_dict['link'].split("?")[0].split("_")[-1]
  892. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  893. while True:
  894. params = {
  895. "to_user_id": to_user_id,
  896. "max_behot_time": max_behot_time,
  897. "_signature": random_signature()
  898. }
  899. headers = {
  900. "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
  901. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  902. }
  903. response = requests.get(
  904. url=url,
  905. headers=headers,
  906. params=params,
  907. proxies=tunnel_proxies(),
  908. timeout=5,
  909. )
  910. if "data" not in response.text or response.status_code != 200:
  911. AliyunLogger.logging(
  912. code="2000",
  913. platform=self.platform,
  914. mode=self.mode,
  915. env=self.env,
  916. message=f"get_videoList:{response.text}\n",
  917. )
  918. return
  919. elif not response.json()["data"]["data"]:
  920. AliyunLogger.logging(
  921. account=link,
  922. code="2000",
  923. platform=self.platform,
  924. mode=self.mode,
  925. env=self.env,
  926. data=response.json(),
  927. message=f"没有更多数据啦~\n",
  928. )
  929. return
  930. else:
  931. video_list = response.json()['data']['data']
  932. max_behot_time = video_list[-1]["max_behot_time"]
  933. for video_obj in video_list:
  934. try:
  935. AliyunLogger.logging(
  936. code="1001",
  937. account=user_dict['uid'],
  938. platform=self.platform,
  939. mode=self.mode,
  940. env=self.env,
  941. data=video_obj,
  942. message="扫描到一条小视频",
  943. )
  944. date_flag = self.process_video_obj(video_obj, user_dict, "s")
  945. if not date_flag:
  946. return
  947. except Exception as e:
  948. AliyunLogger.logging(
  949. code="3000",
  950. platform=self.platform,
  951. mode=self.mode,
  952. env=self.env,
  953. data=video_obj,
  954. message="抓取单条视频异常, 报错原因是: {}".format(e),
  955. )
  956. def process_video_obj(self, video_obj, user_dict, f):
  957. """
  958. process video_obj and extract video_url
  959. """
  960. new_rule = self.rule_maker(user_dict)
  961. trace_id = self.platform + str(uuid.uuid1())
  962. if f == "s":
  963. item_id = video_obj.get("id_str", "")
  964. else:
  965. item_id = video_obj.get("item_id", "")
  966. if not item_id:
  967. AliyunLogger.logging(
  968. code="2005",
  969. account=user_dict['uid'],
  970. platform=self.platform,
  971. mode=self.mode,
  972. env=self.env,
  973. message="无效视频",
  974. data=video_obj,
  975. trace_id=trace_id,
  976. )
  977. return
  978. # 获取视频信息
  979. video_dict = self.get_video_info(item_id=item_id)
  980. video_dict["platform"] = self.platform
  981. video_dict["strategy"] = self.mode
  982. video_dict["out_video_id"] = video_dict["video_id"]
  983. video_dict["width"] = video_dict["video_width"]
  984. video_dict["height"] = video_dict["video_height"]
  985. video_dict["crawler_rule"] = json.dumps(new_rule)
  986. video_dict["user_id"] = user_dict["uid"]
  987. video_dict["publish_time"] = video_dict["publish_time_str"]
  988. video_dict["strategy_type"] = self.mode
  989. video_dict["update_time_stamp"] = int(time.time())
  990. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  991. new_rule.get("period", {}).get("max", 1000)):
  992. if not video_obj['is_top']:
  993. """
  994. 非置顶数据发布时间超过才退出
  995. """
  996. AliyunLogger.logging(
  997. code="2004",
  998. account=user_dict['uid'],
  999. platform=self.platform,
  1000. mode=self.mode,
  1001. env=self.env,
  1002. data=video_dict,
  1003. message="发布时间超过{}天".format(
  1004. int(new_rule.get("period", {}).get("max", 1000))
  1005. ),
  1006. )
  1007. return False
  1008. pipeline = PiaoQuanPipeline(
  1009. platform=self.platform,
  1010. mode=self.mode,
  1011. rule_dict=new_rule,
  1012. env=self.env,
  1013. item=video_dict,
  1014. trace_id=trace_id,
  1015. )
  1016. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  1017. if limit_flag:
  1018. title_flag = pipeline.title_flag()
  1019. repeat_flag = pipeline.repeat_video()
  1020. if title_flag and repeat_flag:
  1021. if new_rule.get("special"):
  1022. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1023. if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  1024. self.mq.send_msg(video_dict)
  1025. self.download_count += 1
  1026. AliyunLogger.logging(
  1027. code="1002",
  1028. account=user_dict['uid'],
  1029. platform=self.platform,
  1030. mode=self.mode,
  1031. env=self.env,
  1032. data=video_dict,
  1033. trace_id=trace_id,
  1034. message="成功发送 MQ 至 ETL",
  1035. )
  1036. return True
  1037. else:
  1038. AliyunLogger.logging(
  1039. code="2008",
  1040. account=user_dict['uid'],
  1041. platform=self.platform,
  1042. mode=self.mode,
  1043. env=self.env,
  1044. message="不满足特殊规则, 点赞量/播放量",
  1045. data=video_dict
  1046. )
  1047. else:
  1048. if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  1049. self.mq.send_msg(video_dict)
  1050. self.download_count += 1
  1051. AliyunLogger.logging(
  1052. code="1002",
  1053. account=user_dict['uid'],
  1054. platform=self.platform,
  1055. mode=self.mode,
  1056. env=self.env,
  1057. data=video_dict,
  1058. trace_id=trace_id,
  1059. message="成功发送 MQ 至 ETL",
  1060. )
  1061. return True
  1062. else:
  1063. AliyunLogger.logging(
  1064. code="2008",
  1065. account=user_dict['uid'],
  1066. platform=self.platform,
  1067. mode=self.mode,
  1068. env=self.env,
  1069. message="不满足特殊规则, 播放量",
  1070. data=video_dict
  1071. )
  1072. return True
  1073. def get_video_info(self, item_id):
  1074. """
  1075. 获取视频信息
  1076. """
  1077. url = "https://www.ixigua.com/{}".format(item_id)
  1078. headers = {
  1079. "accept-encoding": "gzip, deflate",
  1080. "accept-language": "zh-CN,zh-Hans;q=0.9",
  1081. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  1082. "user-agent": FakeUserAgent().random,
  1083. "referer": "https://www.ixigua.com/{}/".format(item_id),
  1084. }
  1085. response = requests.get(
  1086. url=url,
  1087. headers=headers,
  1088. proxies=tunnel_proxies(),
  1089. timeout=5,
  1090. )
  1091. time.sleep(random.randint(1, 5))
  1092. video_info = extract_info_by_re(response.text)
  1093. video_dict = {
  1094. "video_title": video_info.get("title", ""),
  1095. "video_id": video_info.get("video_id"),
  1096. "gid": str(item_id),
  1097. "play_cnt": int(video_info.get("play_count", 0)),
  1098. "like_cnt": int(video_info.get("like_count", 0)),
  1099. "comment_cnt": 0,
  1100. "share_cnt": 0,
  1101. "favorite_cnt": 0,
  1102. "duration": int(video_info.get("duration", 0)),
  1103. "video_width": 0,
  1104. "video_height": 0,
  1105. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  1106. "publish_time_str": time.strftime(
  1107. "%Y-%m-%d %H:%M:%S",
  1108. time.localtime(int(video_info.get("publish_time", 0))),
  1109. ),
  1110. "avatar_url": str(
  1111. video_info.get("user_info", {}).get("avatar_url", "")
  1112. ),
  1113. "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
  1114. "video_url": video_info.get("url"),
  1115. "session": f"xigua-author-{int(time.time())}",
  1116. }
  1117. return video_dict
  1118. if __name__ == "__main__":
  1119. user_list = [
  1120. {
  1121. "uid": 6267140,
  1122. "source": "xigua",
  1123. "link": "https://www.ixigua.com/home/2779177225827568",
  1124. "nick_name": "秋晴爱音乐",
  1125. "avatar_url": "",
  1126. "mode": "author",
  1127. },
  1128. {
  1129. "uid": 6267140,
  1130. "source": "xigua",
  1131. "link": "https://www.ixigua.com/home/2885546124776780",
  1132. "nick_name": "朗诵放歌的老山羊",
  1133. "avatar_url": "",
  1134. "mode": "author",
  1135. },
  1136. {
  1137. "uid": 6267140,
  1138. "source": "xigua",
  1139. "link": "https://www.ixigua.com/home/5880938217",
  1140. "nick_name": "天原声疗",
  1141. "avatar_url": "",
  1142. "mode": "author",
  1143. },
  1144. ]
  1145. rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  1146. XGA = XiGuaAuthor(
  1147. platform="xigua",
  1148. mode="author",
  1149. rule_dict=rule,
  1150. env="prod",
  1151. user_list=user_list
  1152. )
  1153. XGA.get_author_list()