# xigua_dev.py - Xigua (ixigua.com) account crawler, development script.
  1. import json
  2. import os
  3. import re
  4. import random
  5. import sys
  6. import string
  7. import time
  8. import uuid
  9. import base64
  10. import requests
  11. from fake_useragent import FakeUserAgent
  12. from common.mq import MQ
  13. sys.path.append(os.getcwd())
  14. from common import PiaoQuanPipeline, tunnel_proxies
  15. from common.limit import AuthorLimit
  16. def extract_info_by_re(text):
  17. """
  18. 通过正则表达式获取文本中的信息
  19. :param text:
  20. :return:
  21. """
  22. # 标题
  23. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  24. if title_match:
  25. title_content = title_match.group(1)
  26. title_content = title_content.split(" - ")[0]
  27. title_content = bytes(title_content, "latin1").decode()
  28. else:
  29. title_content = ""
  30. # video_url
  31. main_url = re.search(r'("main_url":")(.*?)"', text)[0]
  32. main_url = main_url.split(":")[1]
  33. decoded_data = base64.b64decode(main_url)
  34. try:
  35. # 尝试使用utf-8解码
  36. video_url = decoded_data.decode()
  37. except UnicodeDecodeError:
  38. # 如果utf-8解码失败,尝试使用其他编码方式
  39. video_url = decoded_data.decode('latin-1')
  40. # video_id
  41. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  42. # like_count
  43. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  44. # cover_url
  45. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  46. # video_play
  47. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  48. # "video_publish_time"
  49. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  50. # video_duration
  51. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  52. return {
  53. "title": title_content,
  54. "url": video_url,
  55. "video_id": video_id,
  56. "like_count": like_count,
  57. "cover_url": cover_url,
  58. "play_count": video_watch_count,
  59. "publish_time": publish_time,
  60. "duration": duration
  61. }
  62. def random_signature():
  63. """
  64. 随机生成签名
  65. """
  66. src_digits = string.digits # string_数字
  67. src_uppercase = string.ascii_uppercase # string_大写字母
  68. src_lowercase = string.ascii_lowercase # string_小写字母
  69. digits_num = random.randint(1, 6)
  70. uppercase_num = random.randint(1, 26 - digits_num - 1)
  71. lowercase_num = 26 - (digits_num + uppercase_num)
  72. password = (
  73. random.sample(src_digits, digits_num)
  74. + random.sample(src_uppercase, uppercase_num)
  75. + random.sample(src_lowercase, lowercase_num)
  76. )
  77. random.shuffle(password)
  78. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  79. new_password_start = new_password[0:18]
  80. new_password_end = new_password[-7:]
  81. if new_password[18] == "8":
  82. new_password = new_password_start + "w" + new_password_end
  83. elif new_password[18] == "9":
  84. new_password = new_password_start + "x" + new_password_end
  85. elif new_password[18] == "-":
  86. new_password = new_password_start + "y" + new_password_end
  87. elif new_password[18] == ".":
  88. new_password = new_password_start + "z" + new_password_end
  89. else:
  90. new_password = new_password_start + "y" + new_password_end
  91. return new_password
  92. def byte_dance_cookie(item_id):
  93. """
  94. 获取西瓜视频的 cookie
  95. :param item_id:
  96. """
  97. sess = requests.Session()
  98. sess.headers.update({
  99. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  100. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  101. })
  102. # 获取 cookies
  103. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  104. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  105. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  106. # print(r.text)
  107. return r.cookies.values()[0]
  108. class XiGuaAuthor(object):
  109. """
  110. 西瓜账号爬虫
  111. """
  112. def __init__(self, platform, mode, rule_dict, env, user_list):
  113. self.platform = platform
  114. self.mode = mode
  115. self.rule_dict = rule_dict
  116. self.env = env
  117. self.user_list = user_list
  118. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  119. self.download_count = 0
  120. self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
  121. def rule_maker(self, account):
  122. """
  123. 通过不同的账号生成不同的规则
  124. :param account: 输入的账号信息
  125. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  126. """
  127. temp = account['link'].split("_")
  128. if len(temp) == 1:
  129. return self.rule_dict
  130. else:
  131. flag = temp[-2]
  132. match flag:
  133. case "V1":
  134. rule_dict = {
  135. "play_cnt": {"min": 100000, "max": 0},
  136. 'period': {"min": 90, "max": 90},
  137. 'special': 0.02
  138. }
  139. return rule_dict
  140. case "V2":
  141. rule_dict = {
  142. "play_cnt": {"min": 10000, "max": 0},
  143. 'period': {"min": 90, "max": 90},
  144. 'special': 0.01
  145. }
  146. return rule_dict
  147. case "V3":
  148. rule_dict = {
  149. "play_cnt": {"min": 5000, "max": 0},
  150. 'period': {"min": 90, "max": 90},
  151. 'special': 0.01
  152. }
  153. return rule_dict
  154. def get_author_list(self):
  155. """
  156. 每轮只抓取定量的数据,到达数量后自己退出
  157. 获取账号列表以及账号信息
  158. """
  159. # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
  160. for user_dict in self.user_list:
  161. # if self.download_count <= max_count:
  162. flag = user_dict["link"][0]
  163. print(user_dict)
  164. print(flag)
  165. match flag:
  166. case "V":
  167. self.get_video_list(user_dict)
  168. case "X":
  169. self.get_tiny_video_list(user_dict)
  170. case "h":
  171. self.get_video_list(user_dict)
  172. case "D":
  173. self.get_video_list(user_dict)
  174. case "B":
  175. self.get_video_list(user_dict)
  176. self.get_tiny_video_list(user_dict)
  177. # time.sleep(random.randint(1, 15))
  178. # else:
  179. # AliyunLogger.logging(
  180. # code="2000",
  181. # platform=self.platform,
  182. # mode=self.mode,
  183. # env=self.env,
  184. # message="本轮已经抓取足够数量的视频,已经自动退出",
  185. # )
  186. # return
  187. def get_video_list(self, user_dict):
  188. """
  189. 获取某个账号的视频列表
  190. 账号分为 3 类
  191. """
  192. offset = 0
  193. signature = random_signature()
  194. link = user_dict['link'].split("_")[-1]
  195. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  196. while True:
  197. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  198. params = {
  199. "to_user_id": to_user_id,
  200. "offset": str(offset),
  201. "limit": "30",
  202. "maxBehotTime": "0",
  203. "order": "new",
  204. "isHome": "0",
  205. "_signature": signature,
  206. }
  207. headers = {
  208. "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  209. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  210. }
  211. response = requests.get(
  212. url=url,
  213. headers=headers,
  214. params=params,
  215. proxies=tunnel_proxies(),
  216. timeout=5,
  217. )
  218. offset += 30
  219. if "data" not in response.text or response.status_code != 200:
  220. message = f"get_videoList:{response.text}\n"
  221. print(message)
  222. return
  223. elif not response.json()["data"]["videoList"]:
  224. message = f"没有更多数据啦~\n"
  225. print(params)
  226. return
  227. else:
  228. feeds = response.json()["data"]["videoList"]
  229. for video_obj in feeds:
  230. message = "扫描到一条视频"
  231. print(message)
  232. date_flag = self.process_video_obj(video_obj, user_dict, "l")
  233. if not date_flag:
  234. return
  235. def get_tiny_video_list(self, user_dict):
  236. """
  237. 获取小视频
  238. """
  239. url = "https://www.ixigua.com/api/videov2/hotsoon/video"
  240. max_behot_time = "0"
  241. link = user_dict['link'].split("_")[-1]
  242. to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
  243. while True:
  244. params = {
  245. "to_user_id": to_user_id,
  246. "max_behot_time": max_behot_time,
  247. "_signature": random_signature()
  248. }
  249. headers = {
  250. "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
  251. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  252. }
  253. response = requests.get(
  254. url=url,
  255. headers=headers,
  256. params=params,
  257. proxies=tunnel_proxies(),
  258. timeout=5,
  259. )
  260. if "data" not in response.text or response.status_code != 200:
  261. AliyunLogger.logging(
  262. code="2000",
  263. platform=self.platform,
  264. mode=self.mode,
  265. env=self.env,
  266. message=f"get_videoList:{response.text}\n",
  267. )
  268. return
  269. elif not response.json()["data"]["data"]:
  270. AliyunLogger.logging(
  271. code="2000",
  272. platform=self.platform,
  273. mode=self.mode,
  274. env=self.env,
  275. message=f"没有更多数据啦~\n",
  276. )
  277. return
  278. else:
  279. video_list = response.json()['data']['data']
  280. max_behot_time = video_list[-1]["max_behot_time"]
  281. for video_obj in video_list:
  282. try:
  283. AliyunLogger.logging(
  284. code="1001",
  285. account=user_dict['uid'],
  286. platform=self.platform,
  287. mode=self.mode,
  288. env=self.env,
  289. data=video_obj,
  290. message="扫描到一条小视频",
  291. )
  292. date_flag = self.process_video_obj(video_obj, user_dict, "s")
  293. if not date_flag:
  294. return
  295. except Exception as e:
  296. AliyunLogger.logging(
  297. code="3000",
  298. platform=self.platform,
  299. mode=self.mode,
  300. env=self.env,
  301. data=video_obj,
  302. message="抓取单条视频异常, 报错原因是: {}".format(e),
  303. )
  304. def process_video_obj(self, video_obj, user_dict, f):
  305. """
  306. process video_obj and extract video_url
  307. """
  308. new_rule = self.rule_maker(user_dict)
  309. trace_id = self.platform + str(uuid.uuid1())
  310. if f == "s":
  311. item_id = video_obj.get("id_str", "")
  312. else:
  313. item_id = video_obj.get("item_id", "")
  314. if not item_id:
  315. message="无效视频"
  316. print(message)
  317. return
  318. # 获取视频信息
  319. video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
  320. # video_dict["out_user_id"] = video_dict["user_id"]
  321. video_dict["platform"] = self.platform
  322. video_dict["strategy"] = self.mode
  323. video_dict["out_video_id"] = video_dict["video_id"]
  324. video_dict["width"] = video_dict["video_width"]
  325. video_dict["height"] = video_dict["video_height"]
  326. video_dict["crawler_rule"] = json.dumps(new_rule)
  327. video_dict["user_id"] = user_dict["uid"]
  328. video_dict["publish_time"] = video_dict["publish_time_str"]
  329. video_dict["strategy_type"] = self.mode
  330. video_dict["update_time_stamp"] = int(time.time())
  331. if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
  332. new_rule.get("period", {}).get("max", 1000)):
  333. if not video_obj['is_top']:
  334. """
  335. 非置顶数据发布时间超过才退出
  336. """
  337. message = "发布时间超过{}天".format(
  338. int(new_rule.get("period", {}).get("max", 1000))
  339. )
  340. print(message)
  341. return False
  342. pipeline = PiaoQuanPipeline(
  343. platform=self.platform,
  344. mode=self.mode,
  345. rule_dict=new_rule,
  346. env=self.env,
  347. item=video_dict,
  348. trace_id=trace_id,
  349. )
  350. limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
  351. print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  352. # if limit_flag:
  353. # title_flag = pipeline.title_flag()
  354. # repeat_flag = pipeline.repeat_video()
  355. # if title_flag and repeat_flag:
  356. # if new_rule.get("special"):
  357. # if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  358. # if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
  359. # print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  360. # # self.mq.send_msg(video_dict)
  361. # self.download_count += 1
  362. #
  363. # return True
  364. # else:
  365. # message="不满足特殊规则, 点赞量/播放量"
  366. # print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  367. # print(message)
  368. # return False
  369. #
  370. # else:
  371. # if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
  372. # self.mq.send_msg(video_dict)
  373. # self.download_count += 1
  374. # message="成功发送 MQ 至 ETL",
  375. # )
  376. # return True
  377. # else:
  378. # AliyunLogger.logging(
  379. # code="2008",
  380. # account=user_dict['uid'],
  381. # platform=self.platform,
  382. # mode=self.mode,
  383. # env=self.env,
  384. # message="不满足特殊规则, 播放量",
  385. # data=video_dict
  386. # )
  387. # return True
  388. def get_video_info(self, item_id, trace_id):
  389. """
  390. 获取视频信息
  391. """
  392. url = "https://www.ixigua.com/{}".format(item_id)
  393. headers = {
  394. "accept-encoding": "gzip, deflate",
  395. "accept-language": "zh-CN,zh-Hans;q=0.9",
  396. "user-agent": FakeUserAgent().random,
  397. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  398. "referer": "https://www.ixigua.com/{}/".format(item_id),
  399. }
  400. response = requests.get(
  401. url=url,
  402. headers=headers,
  403. proxies=tunnel_proxies(),
  404. timeout=5,
  405. )
  406. video_info = extract_info_by_re(response.text)
  407. video_dict = {
  408. "video_title": video_info.get("title", ""),
  409. "video_id": video_info.get("video_id"),
  410. "gid": str(item_id),
  411. "play_cnt": int(video_info.get("play_count", 0)),
  412. "like_cnt": int(video_info.get("like_count", 0)),
  413. "comment_cnt": 0,
  414. "share_cnt": 0,
  415. "favorite_cnt": 0,
  416. "duration": int(video_info.get("duration", 0)),
  417. "video_width": 0,
  418. "video_height": 0,
  419. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  420. "publish_time_str": time.strftime(
  421. "%Y-%m-%d %H:%M:%S",
  422. time.localtime(int(video_info.get("publish_time", 0))),
  423. ),
  424. "avatar_url": str(
  425. video_info.get("user_info", {}).get("avatar_url", "")
  426. ),
  427. "cover_url": video_info.get("cover_url", ""),
  428. "video_url": video_info.get("url"),
  429. "session": f"xigua-search-{int(time.time())}",
  430. }
  431. return video_dict
  432. if __name__ == "__main__":
  433. user_list = [
  434. {
  435. "uid": 6267140,
  436. "source": "xigua",
  437. "link": "https://www.ixigua.com/home/2779177225827568",
  438. "nick_name": "秋晴爱音乐",
  439. "avatar_url": "",
  440. "mode": "author",
  441. },
  442. {
  443. "uid": 6267140,
  444. "source": "xigua",
  445. "link": "https://www.ixigua.com/home/2885546124776780",
  446. "nick_name": "朗诵放歌的老山羊",
  447. "avatar_url": "",
  448. "mode": "author",
  449. },
  450. {
  451. "uid": 6267140,
  452. "source": "xigua",
  453. "link": "https://www.ixigua.com/home/5880938217",
  454. "nick_name": "天原声疗",
  455. "avatar_url": "",
  456. "mode": "author",
  457. },
  458. ]
  459. rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
  460. XGA = XiGuaAuthor(
  461. platform="xigua",
  462. mode="author",
  463. rule_dict=rule,
  464. env="prod",
  465. user_list=user_list
  466. )
  467. XGA.get_author_list()