# xigua_search.py
"""
@author: luojunhui
Xigua Video (西瓜视频) search crawler
"""
import base64
import json
import logging
import re
import urllib.parse

import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
from fake_useragent import FakeUserAgent
from lxml import etree

from applications.functions.common import sensitive_flag
  15. class XiGuaFunctions(object):
  16. """
  17. XiGuaSearch Class
  18. """
  19. @classmethod
  20. def tunnel_proxies(cls):
  21. """
  22. 快代理方法
  23. :return:
  24. """
  25. tunnel = "q796.kdltps.com:15818"
  26. username = "t17772369458618"
  27. password = "5zqcjkmy"
  28. proxies = {
  29. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  30. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  31. }
  32. return proxies
  33. @classmethod
  34. def byte_dance_cookie(cls, item_id):
  35. """
  36. 获取西瓜视频的 cookie
  37. :param item_id:
  38. """
  39. sess = requests.Session()
  40. sess.headers.update({
  41. 'user-agent': FakeUserAgent().chrome,
  42. 'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
  43. })
  44. # 获取 cookies
  45. sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
  46. data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
  47. r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
  48. if r.json()['redirect_url']:
  49. requests.get(
  50. url=r.json()['redirect_url']
  51. )
  52. return r.cookies.values()[0]
  53. @classmethod
  54. def aes_decrypt(cls, data, key):
  55. """
  56. XiGua AES decrypt
  57. :param data:
  58. :param key:
  59. :return:
  60. """
  61. password = key.encode()
  62. iv = password[:16]
  63. try:
  64. ct = base64.b64decode(data.encode())
  65. cipher = AES.new(password, AES.MODE_CBC, iv)
  66. pt = unpad(cipher.decrypt(ct), AES.block_size)
  67. return base64.b64decode(pt).decode()
  68. except Exception as e:
  69. print("Incorrect decryption {}".format(e))
  70. return None
  71. @classmethod
  72. def extract_video_url(cls, text):
  73. """
  74. 获取视频 video_url
  75. :param text:
  76. :return:
  77. """
  78. HTML = etree.HTML(text)
  79. str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
  80. json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
  81. Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
  82. # python中不规则的定义
  83. for I in Irregulars:
  84. if I in ['=false', '=true']:
  85. json_2 = json_2.replace(I, '=' + I[1:].capitalize())
  86. else:
  87. json_2 = json_2.replace(I, '12')
  88. dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
  89. duration = dict_2["video_duration"]
  90. play_cnt = dict_2['video_watch_count']
  91. publish_time = int(dict_2['video_publish_time'])
  92. like_cnt = dict_2['video_like_count']
  93. video_title = dict_2['title']
  94. video_id = dict_2['vid']
  95. video_res = dict_2['videoResource']
  96. cover_url = dict_2['poster_url'].replace("\\u002F", "/")
  97. if video_res['dash'] == 12:
  98. obj = video_res['normal']
  99. ptk = obj['ptk']
  100. video_list = obj['video_list']
  101. keys = list(video_list.keys())
  102. main_url = video_list[keys[-1]]['main_url']
  103. real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
  104. else:
  105. obj = video_res['dash']
  106. ptk = obj["ptk"]
  107. video_url = obj['dynamic_video']['main_url']
  108. real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
  109. return {
  110. "video_url": real_video_url,
  111. "cover_url": cover_url,
  112. "video_id": video_id,
  113. "video_title": video_title,
  114. "like_cnt": like_cnt,
  115. "play_cnt": play_cnt,
  116. "publish_time": publish_time,
  117. "duration": duration
  118. }
  119. @classmethod
  120. def extract_info_by_re(cls, text):
  121. """
  122. 通过正则表达式获取文本中的信息
  123. :param text:
  124. :return:
  125. """
  126. result = cls.extract_video_url(text)
  127. # 标题
  128. title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
  129. if title_match:
  130. title_content = title_match.group(1)
  131. title_content = title_content.split(" - ")[0]
  132. try:
  133. title_content = bytes(title_content, "latin1").decode()
  134. except:
  135. title_content = title_content
  136. else:
  137. title_content = ""
  138. result['video_title'] = title_content
  139. return result
  140. @classmethod
  141. def get_video_info(cls, item_id):
  142. """
  143. 获取视频信息
  144. """
  145. url = "https://www.ixigua.com/{}".format(item_id)
  146. headers = {
  147. # "accept-encoding": "gzip, deflate",
  148. "accept-language": "zh-CN,zh-Hans;q=0.9",
  149. # "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
  150. "cookie": "UIFID=73355a799e41c2edb6d004baa6cda0116425031dff9117e11075ec8bf266082874fe897f43e66be83a0501afe4a08cfc7e1066ab88423af122641493c7af9f0a745eb85c50fddb096de5cc77cd5ff05503312d84d36ab2681c6e6d930bbe68edaebf8fae03b04eb669359965e01c266b;"
  151. "__ac_nonce=0666fd1a00053bf535b9f;"
  152. "__ac_signature=_02B4Z6wo00f01u8PTiQAAIDBvfBuP-YjUQbvL0qAAN25bWfWXQrzRNCBKvFYKS5wAOYPXg5XV1Ck9JEroeWeWKijH2v3i4lxXM37JogiJJfEtYD.8sbXul2-4v.VRRta4xa07ignRnGj5Voh83;"
  153. # "msToken=Pc0sCOhbTxWnGbeqIHMcELMObmtTQGPwloqzOwtfsew-ao5WYnHuhKwE4TL_-88EGh64ec36ggsuqMuV-iBmcF1Gg92ZDGlD89lL6r0MMCg-8srTh1GfNgDnVfFq7g==; "
  154. # "tt_scid=wLMuzIiixDpWtXV38R283kz.YIi2x1BE31RggCRLCsFJu204SFWS8Py13xxEPpzZ3b8e;"
  155. "ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1718605316%7C9dfc9322350e713e6109ed46a7047ed31c0ab5a724e84de0bb766c195043207c",
  156. "user-agent": FakeUserAgent().chrome,
  157. "referer": "https://www.ixigua.com/{}/".format(item_id),
  158. }
  159. response = requests.get(
  160. url=url,
  161. headers=headers
  162. )
  163. print(response.text)
  164. video_info = cls.extract_info_by_re(response.text)
  165. return video_info
  166. # class XiGuaVideoDeal(object):
  167. def xigua_search_v2(keyword, sensitive_words):
  168. """
  169. Search By KeyWord
  170. :param sensitive_words:
  171. :param keyword:
  172. :return:
  173. """
  174. url = "https://www.ixigua.com/api/searchv2/complex/{}/10".format(keyword)
  175. params = {}
  176. headers = {
  177. 'accept': 'application/json, text/plain, */*',
  178. 'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
  179. 'cookie': '_tea_utm_cache_2285=undefined;',
  180. 'priority': 'u=1, i',
  181. 'referer': 'https://www.ixigua.com/search/{}'.format(urllib.parse.quote(keyword)),
  182. 'user-agent': FakeUserAgent().chrome
  183. }
  184. response = requests.request("GET", url, headers=headers, params=params)
  185. try:
  186. recall_list = response.json()['data']['data']
  187. if recall_list:
  188. for obj in recall_list:
  189. if obj['type'] == "video":
  190. title = obj['data']['title']
  191. url = obj['data']['group_id']
  192. duration = obj['data']['video_time']
  193. watch_count = obj['data']['video_watch_count']
  194. if sensitive_flag(sensitive_words, title) and duration <= 300:
  195. # try:
  196. res = XiGuaFunctions().get_video_info(url)
  197. if res:
  198. return [res]
  199. else:
  200. continue
  201. # except Exception as e:
  202. # print(e)
  203. return []
  204. else:
  205. return []
  206. except Exception as e:
  207. return []