# dy_ks_get_url.py
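
# NOTE: the docstring below is an editor-added summary inferred from the code in this file.
"""Resolve Douyin / Kuaishou / Haokan share links into direct video URLs.

Each ``get_text_*_video`` helper extracts a content id from a share link and posts it
to an internal crawler detail API (http://8.217.192.46:8889/crawler/...), returning
``(video_url, original_title, video_id)``. ``get_video_url`` dispatches on the link's
host and adds the channel tag. Chinese sentinel strings such as "作品不存在"
("content does not exist") are returned unchanged because callers and the upstream
API compare against them.
"""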
import html
import json
import os
import random
import re
import time
import traceback
import uuid
from datetime import datetime
from urllib.parse import urlparse, parse_qs

import requests
from loguru import logger

from utils.aliyun_log import AliyunLogger
from utils.feishu_utils import Feishu


class Dy_KS:
    @classmethod
    def get_text_dy_video(cls, url):
        """Resolve a Douyin share link to (video_url, original_title, video_id)."""
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Try to read the video id straight from the query string first.
                if "&vid=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('vid', [None])[0]
                elif "?modal_id=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('modal_id', [None])[0]
                else:
                    # Otherwise follow the share link's redirect and parse the id
                    # out of the Location header.
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                                  'q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/127.0.0.0 Safari/537.36',
                    }
                    response = requests.request(url=url, method='GET', headers=headers,
                                                allow_redirects=False, timeout=30)
                    logger.info(f"Request {url} response: {response}")
                    location = response.headers.get('Location', None)
                    match = re.search(r'/video/(\d+)/?', location.split('?')[0] if location else url)
                    if match:
                        video_id = match.group(1)
                    elif location and "&vid=" in location:
                        video_id = re.search(r'vid=(\d+)', location).group(1)
                    else:
                        video_id = None
                if not video_id or not video_id.strip():
                    return None, None, None
                # Ask the internal crawler detail API for the playable URL and title.
                detail_url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
                payload = json.dumps({"content_id": str(video_id)})
                headers = {'Content-Type': 'application/json'}
                response = requests.request("POST", detail_url, headers=headers, data=payload, timeout=60)
                logger.info(f"Request {detail_url} response: {response.json()}")
                response = response.json()
                code = response["code"]
                if code == 0:
                    data = response["data"]["data"]
                    video_url = data["video_url_list"][0]["video_url"]
                    original_title = data["title"]
                    return video_url, original_title, video_id
                if code == 22002 and '抖音内容已被删除或无法访问' in response['msg']:
                    # Sentinel kept in Chinese: callers compare against it ("content does not exist").
                    return "作品不存在", None, None
                # Unexpected code: count it as a failed attempt instead of looping forever.
                retry_count += 1
                time.sleep(1)
            except Exception as e:
                retry_count += 1
                logger.error(f"[+] Douyin {url}: failed to get video URL, error: {e} \n {traceback.format_exc()}")
                time.sleep(1)
        return None, None, None

    @classmethod
    def get_text_ks_video(cls, url):
        """Resolve a Kuaishou share link to (video_url, original_title, video_id)."""
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                          'q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/127.0.0.0 Safari/537.36',
            }
            response = requests.request(url=url, method='GET', headers=headers, allow_redirects=False, timeout=30)
            logger.info(f"Request {url} response: {response}")
            location = response.headers.get('Location', None)
            if location == "https://kuaishou.com/":
                # Redirected to the homepage: the work no longer exists.
                return "作品不存在", None, None
            match = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
                              location.split('?')[0] if location else url)
            if match:
                video_id = match.group(2)
            else:
                # Fall back to the last path segment of the share link.
                video_id = url.rstrip('/').split('/')[-1]
            logger.info(f"[+] Extracted video ID == {video_id}")
            if not video_id or not video_id.strip():
                return None, None, None
            detail_url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
            payload = json.dumps({"content_id": str(video_id)})
            headers = {'Content-Type': 'application/json'}
            # Random delay to avoid hammering the detail API.
            time.sleep(random.uniform(10, 50))
            response = requests.request("POST", detail_url, headers=headers, data=payload, timeout=30)
            logger.info(f"Request {detail_url} response: {response.json()}")
            response = response.json()
            code = response["code"]
            if code == 0:
                data = response["data"]["data"]
                content_type = data['content_type']
                if content_type == 'note':
                    # Image/text note rather than a video.
                    return "note", "note", None
                video_url = data["video_url_list"][0]["video_url"]
                original_title = data["title"]
                return video_url, original_title, video_id
            elif code == 27006:
                # API messages are in Chinese; keep the literals so the checks still match.
                if ("作品不存在" in response['msg'] or "内容不存在" in response['msg']
                        or "私密作品" in response['msg'] or "该作品仅允许关注者查看" in response['msg']):
                    return "作品不存在", None, None
            time.sleep(3)
        except Exception as e:
            logger.error(f"[+] Kuaishou {url}: failed to get video URL, error: {e} \n {traceback.format_exc()}")
        return None, None, None

    @classmethod
    def get_text_hksp_video(cls, url):
        """Resolve a Haokan (haokan.baidu.com) link to (video_url, original_title, video_id)."""
        try:
            parsed_url = urlparse(url)
            query_params = parse_qs(parsed_url.query)
            video_id = query_params.get('vid', [None])[0]
            if not video_id or not video_id.strip():
                return None, None, None
            req_url = "http://8.217.192.46:8889/crawler/hao_kan_shi_pin/detail"
            payload = json.dumps({"content_id": str(video_id)})
            headers = {'Content-Type': 'application/json'}
            max_retries = 3
            retries = 0
            while retries < max_retries:
                try:
                    response = requests.request("POST", req_url, headers=headers, data=payload, timeout=30)
                    response = response.json()
                    code = response["code"]
                    if code == 0:
                        data = response["data"]["data"]
                        content_type = data['content_type']
                        if content_type == 'note':
                            return "note", "note", "note"
                        video_url = data["video_url_list"][0]["video_url"]
                        original_title = data["title"]
                        return video_url, original_title, video_id
                    retries += 1
                    logger.warning(f"[+] Haokan {url}: request returned code {code}, retry {retries}...")
                except Exception as e:
                    retries += 1
                    logger.warning(f"[+] Haokan {url}: request failed with {e}, retry {retries}...")
        except Exception as e:
            logger.error(f"[+] Haokan {url}: failed to get video info, error: {e} \n {traceback.format_exc()}")
        return None, None, None

    @classmethod
    def get_video_url(cls, data, principal):
        """Dispatch by host and return (video_url, original_title, video_id, channel tag)."""
        try:
            url = data['video_url']
            logger.info(f"[+] url=={url}")
            if "&vid=" in url or "?modal_id=" in url or "?vid=" in url:
                host = urlparse(url).netloc
                logger.info(f"[+] host=={host}")
            else:
                # Share texts often wrap the link in extra characters; extract the first URL.
                msg = html.unescape(url)
                pattern = re.search(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(/[-\w._~:/#[\]@!$&()*+,;=]*)', msg)
                logger.info(f"[+] pattern == {pattern}")
                if pattern is None:
                    logger.error(f"[+] {url}: failed to extract a URL")
                    return "重新处理", None, None, None
                url = pattern.group()
                host = urlparse(url).netloc
                logger.info(f"[+] url == {url} host=={host}")
            if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com']:
                tag_transport_channel = "抖音"
                logger.info(f"[+] {url} fetching Douyin video URL")
                url, original_title, video_id = cls.get_text_dy_video(url=url)
            elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com',
                          'live.kuaishou.com']:
                tag_transport_channel = "快手"
                logger.info(f"[+] {url} fetching Kuaishou video URL")
                url, original_title, video_id = cls.get_text_ks_video(url=url)
            elif host in ['haokan.baidu.com']:
                tag_transport_channel = "好看视频"
                logger.info(f"[+] {url} fetching Haokan video URL")
                url, original_title, video_id = cls.get_text_hksp_video(url=url)
            else:
                logger.error(f"[+] {url} is not a Douyin/Kuaishou/Haokan link, skipping")
                AliyunLogger.logging(data["name"], principal, "", data["video_url"],
                                     "不是抖/快/好看视频 不做处理", "1001", str(data))
                return "链接不是抖/快/好看", None, None, None
            if url == "作品不存在":
                return "作品不存在", None, None, None
            return url, original_title, video_id, tag_transport_channel
        except Exception as e:
            logger.error(f"[+] Failed to get video URL: {e}")
            return "重新处理", None, None, None
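

# --- Minimal usage sketch (editor-added; the values below are hypothetical) ---
# get_video_url() only reads data['video_url'] and data['name']; 'principal' is an
# opaque label forwarded to AliyunLogger. The sketch assumes the crawler detail API
# at 8.217.192.46 is reachable from where this runs.
if __name__ == "__main__":
    sample = {
        "name": "demo_account",                        # hypothetical account name
        "video_url": "https://v.douyin.com/xxxxxxx/",  # hypothetical share link
    }
    video_url, title, video_id, channel = Dy_KS.get_video_url(sample, principal="demo")
    logger.info(f"resolved: channel={channel}, video_id={video_id}, title={title}, url={video_url}")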