# xigua_search_dev.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/6/25
  4. import base64
  5. import json
  6. import os
  7. import random
  8. import re
  9. import string
  10. import sys
  11. import time
  12. import requests
  13. import urllib3
  14. from requests.adapters import HTTPAdapter
  15. from selenium.webdriver import DesiredCapabilities, ActionChains
  16. from selenium.webdriver.chrome.service import Service
  17. from selenium import webdriver
  18. from selenium.webdriver.common.by import By
  19. from common.mq import MQ
  20. sys.path.append(os.getcwd())
  21. from common.scheduling_db import MysqlHelper
  22. from common.common import Common
  23. from common.public import get_config_from_mysql, download_rule
  24. from common.userAgent import get_random_user_agent
class XiguasearchDev:
    """Crawler for ixigua.com (Xigua Video) search results — dev version.

    Opens search pages in headless Chrome, resolves per-video metadata and
    media URLs through the site's web APIs, and publishes qualifying videos
    to an MQ topic for downstream ETL.
    """
    # Number of videos downloaded for the current search word
    # (reset per word in get_search_videos).
    download_cnt = 0
    # Platform identifier used in DB dedup queries.
    platform = "xigua"
  29. @classmethod
  30. def random_signature(cls):
  31. src_digits = string.digits # string_数字
  32. src_uppercase = string.ascii_uppercase # string_大写字母
  33. src_lowercase = string.ascii_lowercase # string_小写字母
  34. digits_num = random.randint(1, 6)
  35. uppercase_num = random.randint(1, 26 - digits_num - 1)
  36. lowercase_num = 26 - (digits_num + uppercase_num)
  37. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  38. src_lowercase, lowercase_num)
  39. random.shuffle(password)
  40. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  41. new_password_start = new_password[0:18]
  42. new_password_end = new_password[-7:]
  43. if new_password[18] == '8':
  44. new_password = new_password_start + 'w' + new_password_end
  45. elif new_password[18] == '9':
  46. new_password = new_password_start + 'x' + new_password_end
  47. elif new_password[18] == '-':
  48. new_password = new_password_start + 'y' + new_password_end
  49. elif new_password[18] == '.':
  50. new_password = new_password_start + 'z' + new_password_end
  51. else:
  52. new_password = new_password_start + 'y' + new_password_end
  53. return new_password
  54. @classmethod
  55. def get_video_url(cls, video_info):
  56. video_url_dict = {}
  57. video_resource = video_info.get('videoResource', {})
  58. dash_120fps = video_resource.get('dash_120fps', {})
  59. normal = video_resource.get('normal', {})
  60. # 从dash_120fps和normal字典中获取video_list字典
  61. video_list = dash_120fps.get('video_list', {}) or normal.get('video_list', {})
  62. # 获取video_list字典中的video_4、video_3、video_2或video_1的值。如果找到非空视频URL,则将其赋值给变量video_url。否则,将赋值为空字符串。
  63. video = video_list.get('video_4') or video_list.get('video_3') or video_list.get('video_2') or video_list.get(
  64. 'video_1')
  65. video_url = video.get('backup_url_1', '') if video else ''
  66. audio_url = video.get('backup_url_1', '') if video else ''
  67. video_width = video.get('vwidth', 0) if video else 0
  68. video_height = video.get('vheight', 0) if video else 0
  69. video_url = re.sub(r'[^a-zA-Z0-9+/=]', '', video_url) # 从视频URL中删除特殊字符
  70. audio_url = re.sub(r'[^a-zA-Z0-9+/=]', '', audio_url) # 从音频URL中删除特殊字符
  71. video_url = base64.b64decode(video_url).decode('utf8') # 解码视频URL
  72. audio_url = base64.b64decode(audio_url).decode('utf8') # 解码音频URL
  73. video_url_dict["video_url"] = video_url
  74. video_url_dict["audio_url"] = audio_url
  75. video_url_dict["video_width"] = video_width
  76. video_url_dict["video_height"] = video_height
  77. return video_url_dict
  78. @classmethod
  79. def get_comment_cnt(cls, item_id):
  80. url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
  81. params = {
  82. "tab_index": "0",
  83. "count": "10",
  84. "offset": "10",
  85. "group_id": str(item_id),
  86. "item_id": str(item_id),
  87. "aid": "1768",
  88. "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
  89. "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
  90. "_signature": cls.random_signature(),
  91. }
  92. headers = {
  93. 'authority': 'www.ixigua.com',
  94. 'accept': 'application/json, text/plain, */*',
  95. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  96. 'cache-control': 'no-cache',
  97. 'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3',
  98. 'pragma': 'no-cache',
  99. 'referer': f'https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540',
  100. 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  101. 'sec-ch-ua-mobile': '?0',
  102. 'sec-ch-ua-platform': '"macOS"',
  103. 'sec-fetch-dest': 'empty',
  104. 'sec-fetch-mode': 'cors',
  105. 'sec-fetch-site': 'same-origin',
  106. 'tt-anti-token': 'cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422',
  107. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35',
  108. 'x-secsdk-csrf-token': '000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5'
  109. }
  110. urllib3.disable_warnings()
  111. s = requests.session()
  112. # max_retries=3 重试3次
  113. s.mount('http://', HTTPAdapter(max_retries=3))
  114. s.mount('https://', HTTPAdapter(max_retries=3))
  115. response = s.get(url=url, headers=headers, params=params, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
  116. response.close()
  117. if response.status_code != 200 or 'total_number' not in response.json() or response.json() == {}:
  118. return 0
  119. return response.json().get("total_number", 0)
  120. # 获取视频详情
  121. @classmethod
  122. def get_video_info(cls, log_type, crawler, item_id):
  123. url = 'https://www.ixigua.com/api/mixVideo/information?'
  124. headers = {
  125. "accept-encoding": "gzip, deflate",
  126. "accept-language": "zh-CN,zh-Hans;q=0.9",
  127. "user-agent": get_random_user_agent('pc'),
  128. "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
  129. }
  130. params = {
  131. 'mixId': str(item_id),
  132. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
  133. 'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  134. 'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
  135. '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
  136. 'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
  137. }
  138. cookies = {
  139. 'ixigua-a-s': '1',
  140. 'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
  141. 'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
  142. 'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
  143. '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
  144. 'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
  145. 'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
  146. '__ac_nonce': '06304878000964fdad287',
  147. '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
  148. 'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
  149. 'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
  150. '_tea_utm_cache_1300': 'undefined',
  151. 'support_avif': 'false',
  152. 'support_webp': 'false',
  153. 'xiguavideopcwebid': '7134967546256016900',
  154. 'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
  155. }
  156. urllib3.disable_warnings()
  157. s = requests.session()
  158. # max_retries=3 重试3次
  159. s.mount('http://', HTTPAdapter(max_retries=3))
  160. s.mount('https://', HTTPAdapter(max_retries=3))
  161. response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
  162. response.close()
  163. if response.status_code != 200 or 'data' not in response.json() or response.json()['data'] == {}:
  164. Common.logger(log_type, crawler).warning(f"get_video_info:{response.status_code}, {response.text}\n")
  165. return None
  166. else:
  167. video_info = response.json()['data'].get("gidInformation", {}).get("packerData", {}).get("video", {})
  168. if video_info == {}:
  169. return None
  170. video_dict = {
  171. "video_title": video_info.get("title", ""),
  172. "video_id": video_info.get("videoResource", {}).get("vid", ""),
  173. "gid": str(item_id),
  174. "play_cnt": int(video_info.get("video_watch_count", 0)),
  175. "like_cnt": int(video_info.get("video_like_count", 0)),
  176. "comment_cnt": int(cls.get_comment_cnt(item_id)),
  177. "share_cnt": 0,
  178. "favorite_cnt": 0,
  179. "duration": int(video_info.get("video_duration", 0)),
  180. "video_width": int(cls.get_video_url(video_info)["video_width"]),
  181. "video_height": int(cls.get_video_url(video_info)["video_height"]),
  182. "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
  183. "publish_time_str": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_info.get("video_publish_time", 0)))),
  184. "user_name": video_info.get("user_info", {}).get("name", ""),
  185. "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
  186. "avatar_url": str(video_info.get("user_info", {}).get("avatar_url", "")),
  187. "cover_url": video_info.get("poster_url", ""),
  188. "audio_url": cls.get_video_url(video_info)["audio_url"],
  189. "video_url": cls.get_video_url(video_info)["video_url"],
  190. "session": f"xigua-search-{int(time.time())}"
  191. }
  192. return video_dict
    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        """Return how many crawler_video rows already exist for `video_id`
        (0 means the video has not been downloaded yet)."""
        # NOTE(review): the SQL is assembled by f-string interpolation;
        # `video_id` originates from scraped page hrefs, so this is
        # injectable in principle — confirm whether MysqlHelper supports
        # parameterized queries and switch to them if so.
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, action="")
        return len(repeat_video)
  198. @classmethod
  199. def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
  200. Common.logger(log_type, crawler).info(f"搜索词总数:{len(user_list)}\n")
  201. Common.logging(log_type, crawler, env, f"搜索词总数:{len(user_list)}\n")
  202. for user_dict in user_list:
  203. try:
  204. cls.download_cnt = 0
  205. Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
  206. Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['link']} 视频\n")
  207. cls.get_videoList(log_type=log_type,
  208. crawler=crawler,
  209. user_dict=user_dict,
  210. rule_dict=rule_dict,
  211. env=env)
  212. except Exception as e:
  213. Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
  214. Common.logging(log_type, crawler, env, f"抓取{user_dict['link']}视频时异常:{e}\n")
  215. @classmethod
  216. def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
  217. mq = MQ(topic_name="topic_crawler_etl_" + env)
  218. # 打印请求配置
  219. ca = DesiredCapabilities.CHROME
  220. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  221. # 不打开浏览器运行
  222. chrome_options = webdriver.ChromeOptions()
  223. chrome_options.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  224. chrome_options.add_argument("--window-size=1920,1080")
  225. chrome_options.add_argument("--headless")
  226. chrome_options.add_argument("--no-sandbox")
  227. if env == "dev":
  228. chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"
  229. else:
  230. chromedriver = "/usr/bin/chromedriver"
  231. # driver初始化
  232. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
  233. driver.implicitly_wait(10)
  234. Common.logger(log_type, crawler).info(f"打开搜索页:{user_dict['link']}")
  235. Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
  236. driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
  237. time.sleep(2)
  238. # Common.logger(log_type, crawler).info("关闭登录弹框")
  239. # Common.logging(log_type, crawler, env, "关闭登录弹框")
  240. if driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]') != 0:
  241. driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
  242. while True:
  243. # 查找视频列表
  244. video_elements = driver.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')
  245. if len(video_elements) == 0:
  246. Common.logger(log_type, crawler).warning('未搜索到视频\n')
  247. Common.logging(log_type, crawler, env, '未搜索到视频\n')
  248. driver.quit()
  249. return
  250. elif len(video_elements) == 1000:
  251. Common.logger(log_type, crawler).info("已扫描 1000 条视频\n")
  252. break
  253. elif driver.find_element(By.XPATH, '//*[@class="Feed-footer"]').text == "没有更多内容了":
  254. Common.logger(log_type, crawler).info(f"已扫描 {len(video_elements)} 条视频\n")
  255. break
  256. else:
  257. # 拖动列表最后一条视频至屏幕中间
  258. Common.logger(log_type, crawler).info("拖动列表最后一条视频至屏幕中间")
  259. action = ActionChains(driver)
  260. action.move_to_element(video_elements[-1]).perform()
  261. time.sleep(1)
  262. for i, video_element in enumerate(video_elements):
  263. try:
  264. if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
  265. Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
  266. Common.logging(log_type, crawler, env, f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
  267. driver.quit()
  268. return
  269. Common.logger(log_type, crawler).info(f'正在抓取第{i+1}条视频')
  270. Common.logging(log_type, crawler, env, f'正在抓取第{i+1}条视频')
  271. item_id = video_element.get_attribute('href').split("com/")[-1].split("?&")[0]
  272. # title = video_element.get_attribute('title')
  273. # Common.logger(log_type, crawler).info(f"标题:{title}")
  274. # Common.logging(log_type, crawler, env, f"标题:{title}")
  275. video_dict = cls.get_video_info(log_type, crawler, item_id)
  276. if video_dict is None:
  277. Common.logger(log_type, crawler).info("无效视频\n")
  278. Common.logging(log_type, crawler, env, "无效视频\n")
  279. continue
  280. for k, v in video_dict.items():
  281. Common.logger(log_type, crawler).info(f"{k}:{v}")
  282. Common.logging(log_type, crawler, env, f"{video_dict}")
  283. if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
  284. Common.logger(log_type, crawler).info("不满足抓取规则\n")
  285. Common.logging(log_type, crawler, env, "不满足抓取规则\n")
  286. elif any(str(word) if str(word) in video_dict["video_title"] else False
  287. for word in get_config_from_mysql(log_type=log_type,
  288. source=crawler,
  289. env=env,
  290. text="filter",
  291. action="")) is True:
  292. Common.logger(log_type, crawler).info('已中过滤词\n')
  293. Common.logging(log_type, crawler, env, '已中过滤词\n')
  294. elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
  295. Common.logger(log_type, crawler).info('视频已下载\n')
  296. Common.logging(log_type, crawler, env, '视频已下载\n')
  297. else:
  298. # title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
  299. # if title_score <= 0.3:
  300. # Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
  301. # Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
  302. # continue
  303. # Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
  304. # Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
  305. # cls.download_publish(log_type=log_type,
  306. # crawler=crawler,
  307. # user_dict=user_dict,
  308. # video_dict=video_dict,
  309. # rule_dict=rule_dict,
  310. # title_score=title_score,
  311. # env=env)
  312. video_dict["out_user_id"] = video_dict["user_id"]
  313. video_dict["platform"] = crawler
  314. video_dict["strategy"] = log_type
  315. video_dict["out_video_id"] = video_dict["video_id"]
  316. video_dict["width"] = video_dict["video_width"]
  317. video_dict["height"] = video_dict["video_height"]
  318. video_dict["crawler_rule"] = json.dumps(rule_dict)
  319. video_dict["user_id"] = user_dict["uid"]
  320. video_dict["publish_time"] = video_dict["publish_time_str"]
  321. video_dict["strategy_type"] = log_type
  322. mq.send_msg(video_dict)
  323. cls.download_cnt += 1
  324. Common.logger(log_type, crawler).info("满足下载规则\n")
  325. except Exception as e:
  326. Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
  327. Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
if __name__ == '__main__':
    # Dev module: no standalone entry point; invoked via the scheduler.
    pass