"""
xigua_search.py

@author: luojunhui
Xigua Video (西瓜视频) search crawler.
"""
  5. import re
  6. import json
  7. import time
  8. import random
  9. import base64
  10. import urllib.parse
  11. import requests
  12. from lxml import etree
  13. from Crypto.Cipher import AES
  14. from Crypto.Util.Padding import unpad
  15. from fake_useragent import FakeUserAgent
  16. def byte_dance_cookie(item_id):
  17. """
  18. 获取西瓜视频的 cookie
  19. :param item_id:
  20. """
  21. sess = requests.Session()
  22. sess.headers.update({
  23. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  24. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  25. })
  26. # 获取 cookies
  27. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  28. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  29. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  30. # print(r.text)
  31. return r.cookies.values()[0]
  32. def aes_decrypt(data: str, key: str) -> str:
  33. """
  34. XiGua AES decrypt
  35. :param data:
  36. :param key:
  37. :return:
  38. """
  39. password = key.encode()
  40. iv = password[:16]
  41. try:
  42. ct = base64.b64decode(data.encode())
  43. cipher = AES.new(password, AES.MODE_CBC, iv)
  44. pt = unpad(cipher.decrypt(ct), AES.block_size)
  45. return base64.b64decode(pt).decode()
  46. except Exception as e:
  47. print("Incorrect decryption {}".format(e))
  48. return None
  49. def extract_video_url(text):
  50. """
  51. 获取视频 video_url
  52. :param text:
  53. :return:
  54. """
  55. HTML = etree.HTML(text)
  56. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  57. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  58. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  59. # python中不规则的定义
  60. for I in Irregulars:
  61. if I in ['=false', '=true']:
  62. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  63. else:
  64. json_2 = json_2.replace(I, '12')
  65. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]
  66. if dict_2['dash'] == 12:
  67. obj = dict_2['normal']
  68. ptk = obj['ptk']
  69. main_url = obj['video_list']['video_3']['main_url']
  70. real_video_url = aes_decrypt(data=main_url, key=ptk)
  71. else:
  72. obj = dict_2['dash']
  73. ptk = obj["ptk"]
  74. video_url = obj['dynamic_video']['main_url']
  75. real_video_url = aes_decrypt(data=video_url, key=ptk)
  76. return real_video_url
  77. def extract_info_by_re(text):
  78. """
  79. 通过正则表达式获取文本中的信息
  80. :param text:
  81. :return:
  82. """
  83. # 标题
  84. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  85. if title_match:
  86. title_content = title_match.group(1)
  87. title_content = title_content.split(" - ")[0]
  88. title_content = bytes(title_content, "latin1").decode()
  89. else:
  90. title_content = ""
  91. # video_id
  92. video_id = re.search(r'"vid":"(.*?)"', text).group(1)
  93. # like_count
  94. like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
  95. # cover_url
  96. cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
  97. # video_play
  98. video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
  99. # "video_publish_time"
  100. publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
  101. # video_duration
  102. duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
  103. return {
  104. "title": title_content,
  105. "url": extract_video_url(text),
  106. "video_id": video_id,
  107. "like_count": like_count,
  108. "cover_url": cover_url,
  109. "play_count": video_watch_count,
  110. "publish_time": publish_time,
  111. "duration": duration
  112. }
  113. def byte_dance_cookie(item_id):
  114. """
  115. 获取西瓜视频的 cookie
  116. :param item_id:
  117. """
  118. sess = requests.Session()
  119. sess.headers.update({
  120. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
  121. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  122. })
  123. # 获取 cookies
  124. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  125. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  126. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  127. # print(r.text)
  128. return r.cookies.values()[0]
  129. def get_video_info(item_id):
  130. """
  131. 获取视频信息
  132. """
  133. url = "https://www.ixigua.com/{}".format(item_id)
  134. headers = {
  135. "accept-encoding": "gzip, deflate",
  136. "accept-language": "zh-CN,zh-Hans;q=0.9",
  137. "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
  138. "user-agent": FakeUserAgent().random,
  139. "referer": "https://www.ixigua.com/{}/".format(item_id),
  140. }
  141. response = requests.get(
  142. url=url,
  143. headers=headers,
  144. # proxies=tunnel_proxies(),
  145. timeout=5,
  146. )
  147. time.sleep(random.randint(1, 5))
  148. video_info = extract_info_by_re(response.text)
  149. video_dict = {
  150. "video_title": video_info.get("title", ""),
  151. "video_id": video_info.get("video_id"),
  152. "gid": str(item_id),
  153. "play_cnt": int(video_info.get("play_count", 0)),
  154. "like_cnt": int(video_info.get("like_count", 0)),
  155. "comment_cnt": 0,
  156. "share_cnt": 0,
  157. "favorite_cnt": 0,
  158. "duration": int(video_info.get("duration", 0)),
  159. "video_width": 0,
  160. "video_height": 0,
  161. "publish_time_stamp": int(video_info.get("publish_time", 0)),
  162. "publish_time_str": time.strftime(
  163. "%Y-%m-%d %H:%M:%S",
  164. time.localtime(int(video_info.get("publish_time", 0))),
  165. ),
  166. "avatar_url": str(
  167. video_info.get("user_info", {}).get("avatar_url", "")
  168. ),
  169. "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
  170. "video_url": video_info.get("url"),
  171. "session": f"xigua-author-{int(time.time())}",
  172. }
  173. return video_dict
  174. def xigua_search(keyword):
  175. """
  176. 搜索
  177. """
  178. keyword = urllib.parse.quote(keyword)
  179. base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
  180. keyword
  181. )
  182. headers = {
  183. "authority": "www.ixigua.com",
  184. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  185. "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
  186. "cache-control": "max-age=0",
  187. "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
  188. "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
  189. "sec-ch-ua-mobile": "?0",
  190. "sec-ch-ua-platform": '"macOS"',
  191. "sec-fetch-dest": "document",
  192. "sec-fetch-mode": "navigate",
  193. "sec-fetch-site": "none",
  194. "sec-fetch-user": "?1",
  195. "upgrade-insecure-requests": "1",
  196. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  197. }
  198. basic_response = requests.get(url=base_url, headers=headers)
  199. html = etree.HTML(basic_response.text)
  200. result = html.xpath(
  201. '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
  202. )
  203. res_list = []
  204. for page_id in result[:5]:
  205. doc_id = page_id[1:].split("?")[0]
  206. try:
  207. res = get_video_info(doc_id)
  208. temp = ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
  209. res_list.append(temp)
  210. except:
  211. pass
  212. return res_list