# xigua_search.py
"""
@author: luojunhui
Xigua Video search crawler.
"""
  5. import re
  6. import json
  7. import base64
  8. import requests
  9. import urllib.parse
  10. from lxml import etree
  11. from Crypto.Cipher import AES
  12. from Crypto.Util.Padding import unpad
  13. from fake_useragent import FakeUserAgent
  14. from applications.functions.common import sensitive_flag
  15. class XiGuaFunctions(object):
  16. """
  17. XiGuaSearch Class
  18. """
  19. @classmethod
  20. def tunnel_proxies(cls):
  21. """
  22. 快代理方法
  23. :return:
  24. """
  25. tunnel = "q796.kdltps.com:15818"
  26. username = "t17772369458618"
  27. password = "5zqcjkmy"
  28. proxies = {
  29. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  30. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  31. }
  32. return proxies
  33. @classmethod
  34. def byte_dance_cookie(cls, item_id):
  35. """
  36. 获取西瓜视频的 cookie
  37. :param item_id:
  38. """
  39. sess = requests.Session()
  40. sess.headers.update({
  41. 'user-agent': FakeUserAgent().chrome,
  42. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  43. })
  44. # 获取 cookies
  45. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  46. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  47. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  48. if r.json()['redirect_url']:
  49. requests.get(
  50. url=r.json()['redirect_url']
  51. )
  52. return r.cookies.values()[0]
  53. @classmethod
  54. def aes_decrypt(cls, data, key):
  55. """
  56. XiGua AES decrypt
  57. :param data:
  58. :param key:
  59. :return:
  60. """
  61. password = key.encode()
  62. iv = password[:16]
  63. try:
  64. ct = base64.b64decode(data.encode())
  65. cipher = AES.new(password, AES.MODE_CBC, iv)
  66. pt = unpad(cipher.decrypt(ct), AES.block_size)
  67. return base64.b64decode(pt).decode()
  68. except Exception as e:
  69. print("Incorrect decryption {}".format(e))
  70. return None
  71. @classmethod
  72. def extract_video_url(cls, text):
  73. """
  74. 获取视频 video_url
  75. :param text:
  76. :return:
  77. """
  78. HTML = etree.HTML(text)
  79. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  80. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  81. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  82. # python中不规则的定义
  83. for I in Irregulars:
  84. if I in ['=false', '=true']:
  85. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  86. else:
  87. json_2 = json_2.replace(I, '12')
  88. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
  89. duration = dict_2["video_duration"]
  90. play_cnt = dict_2['video_watch_count']
  91. publish_time = int(dict_2['video_publish_time'])
  92. like_cnt = dict_2['video_like_count']
  93. video_title = dict_2['title']
  94. video_id = dict_2['vid']
  95. video_res = dict_2['videoResource']
  96. cover_url = dict_2['poster_url'].replace("\\u002F", "/")
  97. if video_res['dash'] == 12:
  98. obj = video_res['normal']
  99. ptk = obj['ptk']
  100. video_list = obj['video_list']
  101. keys = list(video_list.keys())
  102. main_url = video_list[keys[-1]]['main_url']
  103. real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
  104. else:
  105. obj = video_res['dash']
  106. ptk = obj["ptk"]
  107. video_url = obj['dynamic_video']['main_url']
  108. real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
  109. return {
  110. "video_url": real_video_url,
  111. "cover_url": cover_url,
  112. "video_id": video_id,
  113. "video_title": video_title,
  114. "like_cnt": like_cnt,
  115. "play_cnt": play_cnt,
  116. "publish_time": publish_time,
  117. "duration": duration
  118. }
  119. @classmethod
  120. def extract_info_by_re(cls, text):
  121. """
  122. 通过正则表达式获取文本中的信息
  123. :param text:
  124. :return:
  125. """
  126. result = cls.extract_video_url(text)
  127. # 标题
  128. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  129. if title_match:
  130. title_content = title_match.group(1)
  131. title_content = title_content.split(" - ")[0]
  132. try:
  133. title_content = bytes(title_content, "latin1").decode()
  134. except:
  135. title_content = title_content
  136. else:
  137. title_content = ""
  138. result['video_title'] = title_content
  139. return result
  140. @classmethod
  141. def get_video_info(cls, item_id):
  142. """
  143. 获取视频信息
  144. """
  145. url = "https://www.ixigua.com/{}".format(item_id)
  146. headers = {
  147. "accept-encoding": "gzip, deflate",
  148. "accept-language": "zh-CN,zh-Hans;q=0.9",
  149. "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
  150. "user-agent": FakeUserAgent().random,
  151. "referer": "https://www.ixigua.com/{}/".format(item_id),
  152. }
  153. response = requests.get(
  154. url=url,
  155. headers=headers
  156. )
  157. video_info = cls.extract_info_by_re(response.text)
  158. return video_info
  159. def xigua_search(keyword, sensitive_words):
  160. """
  161. 搜索
  162. """
  163. keyword = urllib.parse.quote(keyword)
  164. base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
  165. keyword
  166. )
  167. headers = {
  168. "authority": "www.ixigua.com",
  169. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  170. "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
  171. "cache-control": "max-age=0",
  172. "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
  173. "upgrade-insecure-requests": "1",
  174. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  175. }
  176. basic_response = requests.get(url=base_url, headers=headers)
  177. html = etree.HTML(basic_response.text)
  178. result_list = html.xpath(
  179. '//div[@class="HorizontalFeedCard searchPageV2__card"]/div[1]/a'
  180. )
  181. if result_list:
  182. for item in result_list:
  183. try:
  184. url = item.xpath("@href")[0]
  185. duration_str = str(item.xpath("./span/text()")[0])
  186. duration_obj = duration_str.split(":")
  187. if len(duration_obj) == 3:
  188. duration = 100000
  189. elif len(duration_obj) == 2:
  190. duration = int(duration_str.split(":")[0]) * 60 + int(duration_str.split(":")[1])
  191. else:
  192. duration = 10000
  193. title = item.xpath("@title")[0]
  194. real_title = bytes(str(title), "latin1").decode()
  195. if sensitive_flag(sensitive_words, real_title) and duration <= 300:
  196. try:
  197. res = XiGuaFunctions().get_video_info(url[1:])
  198. if res:
  199. return [res]
  200. else:
  201. continue
  202. except Exception as e:
  203. print(e)
  204. except Exception as e:
  205. print(e)
  206. return []
  207. else:
  208. return []