xigua_search.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. """
  2. @author: luojunhui
  3. 西瓜视频搜索爬虫
  4. """
  5. import re
  6. import json
  7. import base64
  8. import requests
  9. import urllib.parse
  10. from lxml import etree
  11. from Crypto.Cipher import AES
  12. from Crypto.Util.Padding import unpad
  13. from fake_useragent import FakeUserAgent
  14. from applications.functions.common import sensitive_flag
  15. class XiGuaFunctions(object):
  16. """
  17. XiGuaSearch Class
  18. """
  19. @classmethod
  20. def tunnel_proxies(cls):
  21. """
  22. 快代理方法
  23. :return:
  24. """
  25. # 隧道域名:端口号
  26. tunnel = "l901.kdltps.com:15818"
  27. # 用户名密码方式
  28. username = "t11983523373311"
  29. password = "mtuhdr2z"
  30. proxies = {
  31. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  32. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  33. }
  34. return proxies
  35. @classmethod
  36. def byte_dance_cookie(cls, item_id):
  37. """
  38. 获取西瓜视频的 cookie
  39. :param item_id:
  40. """
  41. sess = requests.Session()
  42. sess.headers.update({
  43. 'user-agent': FakeUserAgent().chrome,
  44. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  45. })
  46. # 获取 cookies
  47. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  48. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  49. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  50. if r.json()['redirect_url']:
  51. requests.get(
  52. url=r.json()['redirect_url']
  53. )
  54. return r.cookies.values()[0]
  55. @classmethod
  56. def aes_decrypt(cls, data, key):
  57. """
  58. XiGua AES decrypt
  59. :param data:
  60. :param key:
  61. :return:
  62. """
  63. password = key.encode()
  64. iv = password[:16]
  65. try:
  66. ct = base64.b64decode(data.encode())
  67. cipher = AES.new(password, AES.MODE_CBC, iv)
  68. pt = unpad(cipher.decrypt(ct), AES.block_size)
  69. return base64.b64decode(pt).decode()
  70. except Exception as e:
  71. print("Incorrect decryption {}".format(e))
  72. return None
  73. @classmethod
  74. def extract_video_url(cls, text):
  75. """
  76. 获取视频 video_url
  77. :param text:
  78. :return:
  79. """
  80. HTML = etree.HTML(text)
  81. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  82. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  83. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  84. # python中不规则的定义
  85. for I in Irregulars:
  86. if I in ['=false', '=true']:
  87. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  88. else:
  89. json_2 = json_2.replace(I, '12')
  90. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
  91. duration = dict_2["video_duration"]
  92. play_cnt = dict_2['video_watch_count']
  93. publish_time = int(dict_2['video_publish_time'])
  94. like_cnt = dict_2['video_like_count']
  95. video_title = dict_2['title']
  96. video_id = dict_2['vid']
  97. video_res = dict_2['videoResource']
  98. cover_url = dict_2['poster_url'].replace("\\u002F", "/")
  99. if video_res['dash'] == 12:
  100. obj = video_res['normal']
  101. ptk = obj['ptk']
  102. video_list = obj['video_list']
  103. keys = list(video_list.keys())
  104. main_url = video_list[keys[-1]]['main_url']
  105. real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
  106. else:
  107. obj = video_res['dash']
  108. ptk = obj["ptk"]
  109. video_url = obj['dynamic_video']['main_url']
  110. real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
  111. return {
  112. "video_url": real_video_url,
  113. "cover_url": cover_url,
  114. "video_id": video_id,
  115. "video_title": video_title,
  116. "like_cnt": like_cnt,
  117. "play_cnt": play_cnt,
  118. "publish_time": publish_time,
  119. "duration": duration
  120. }
  121. @classmethod
  122. def extract_info_by_re(cls, text):
  123. """
  124. 通过正则表达式获取文本中的信息
  125. :param text:
  126. :return:
  127. """
  128. result = cls.extract_video_url(text)
  129. # 标题
  130. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  131. if title_match:
  132. title_content = title_match.group(1)
  133. title_content = title_content.split(" - ")[0]
  134. try:
  135. title_content = bytes(title_content, "latin1").decode()
  136. except:
  137. title_content = title_content
  138. else:
  139. title_content = ""
  140. result['video_title'] = title_content
  141. return result
  142. @classmethod
  143. def get_video_info(cls, item_id):
  144. """
  145. 获取视频信息
  146. """
  147. url = "https://www.ixigua.com/{}".format(item_id)
  148. headers = {
  149. # "accept-encoding": "gzip, deflate",
  150. "accept-language": "zh-CN,zh-Hans;q=0.9",
  151. # "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
  152. "cookie": "UIFID=73355a799e41c2edb6d004baa6cda0116425031dff9117e11075ec8bf266082874fe897f43e66be83a0501afe4a08cfc7e1066ab88423af122641493c7af9f0a745eb85c50fddb096de5cc77cd5ff05503312d84d36ab2681c6e6d930bbe68edaebf8fae03b04eb669359965e01c266b;"
  153. "__ac_nonce=0666fd1a00053bf535b9f;"
  154. "__ac_signature=_02B4Z6wo00f01u8PTiQAAIDBvfBuP-YjUQbvL0qAAN25bWfWXQrzRNCBKvFYKS5wAOYPXg5XV1Ck9JEroeWeWKijH2v3i4lxXM37JogiJJfEtYD.8sbXul2-4v.VRRta4xa07ignRnGj5Voh83;"
  155. # "msToken=Pc0sCOhbTxWnGbeqIHMcELMObmtTQGPwloqzOwtfsew-ao5WYnHuhKwE4TL_-88EGh64ec36ggsuqMuV-iBmcF1Gg92ZDGlD89lL6r0MMCg-8srTh1GfNgDnVfFq7g==; "
  156. # "tt_scid=wLMuzIiixDpWtXV38R283kz.YIi2x1BE31RggCRLCsFJu204SFWS8Py13xxEPpzZ3b8e;"
  157. "ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1718605316%7C9dfc9322350e713e6109ed46a7047ed31c0ab5a724e84de0bb766c195043207c",
  158. "user-agent": FakeUserAgent().chrome,
  159. "referer": "https://www.ixigua.com/{}/".format(item_id),
  160. }
  161. response = requests.get(
  162. url=url,
  163. headers=headers
  164. )
  165. # print(response.text)
  166. video_info = cls.extract_info_by_re(response.text)
  167. return video_info
  168. # class XiGuaVideoDeal(object):
  169. def xigua_search_v2(keyword, sensitive_words):
  170. """
  171. Search By KeyWord
  172. :param sensitive_words:
  173. :param keyword:
  174. :return:
  175. """
  176. url = "https://www.ixigua.com/api/searchv2/complex/{}/10".format(keyword)
  177. params = {}
  178. headers = {
  179. 'accept': 'application/json, text/plain, */*',
  180. 'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
  181. 'cookie': '_tea_utm_cache_2285=undefined;',
  182. 'priority': 'u=1, i',
  183. 'referer': 'https://www.ixigua.com/search/{}'.format(urllib.parse.quote(keyword)),
  184. 'user-agent': FakeUserAgent().chrome
  185. }
  186. response = requests.request("GET", url, headers=headers, params=params)
  187. try:
  188. recall_list = response.json()['data']['data']
  189. if recall_list:
  190. for obj in recall_list:
  191. if obj['type'] == "video":
  192. title = obj['data']['title']
  193. url = obj['data']['group_id']
  194. duration = obj['data']['video_time']
  195. watch_count = obj['data']['video_watch_count']
  196. if sensitive_flag(sensitive_words, title) and duration <= 300:
  197. # try:
  198. res = XiGuaFunctions().get_video_info(url)
  199. if res:
  200. return [res]
  201. else:
  202. continue
  203. # except Exception as e:
  204. # print(e)
  205. return []
  206. else:
  207. return []
  208. except Exception as e:
  209. return []