# dy_ks_get_url.py
  1. import html
  2. import json
  3. import os
  4. import random
  5. import re
  6. import time
  7. import uuid
  8. import requests
  9. from datetime import datetime
  10. from urllib.parse import urlparse, parse_qs
  11. from loguru import logger
  12. from utils.aliyun_log import AliyunLogger
  13. from utils.feishu_utils import Feishu
  14. class Dy_KS:
  15. @classmethod
  16. def get_text_dy_video(cls,url):
  17. max_retries = 3
  18. retry_count = 0
  19. while retry_count < max_retries:
  20. try:
  21. if "&vid=" in url:
  22. parsed_url = urlparse(url)
  23. params = parse_qs(parsed_url.query)
  24. video_id = params.get('vid', [None])[0]
  25. elif "?modal_id=" in url:
  26. parsed_url = urlparse(url)
  27. params = parse_qs(parsed_url.query)
  28. video_id = params.get('modal_id', [None])[0]
  29. else:
  30. headers = {
  31. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
  32. 'q=0.8,application/signed-exchange;v=b3;q=0.7',
  33. 'Accept-Language': 'zh-CN,zh;q=0.9',
  34. 'Cache-Control': 'no-cache',
  35. 'Pragma': 'no-cache',
  36. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  37. 'Chrome/127.0.0.0 Safari/537.36',
  38. }
  39. response = requests.request(url=url, method='GET', headers=headers, allow_redirects=False, timeout=30)
  40. location = response.headers.get('Location', None)
  41. match = re.search(r'/video/(\d+)/?', location.split('?')[0] if location else url)
  42. if match:
  43. video_id = match.group(1)
  44. elif "&vid=" in location:
  45. video_id = re.search(r'vid=(\d+)', location).group(1)
  46. url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
  47. if not video_id or not video_id.strip():
  48. return None, None, None
  49. payload = json.dumps({
  50. "content_id": str(video_id)
  51. })
  52. headers = {
  53. 'Content-Type': 'application/json'
  54. }
  55. response = requests.request("POST", url, headers=headers, data=payload, timeout= 60)
  56. response = response.json()
  57. code = response["code"]
  58. if code == 0:
  59. data = response["data"]["data"]
  60. video_url = data["video_url_list"][0]["video_url"]
  61. original_title = data["title"]
  62. return video_url, original_title, video_id
  63. if code == 22002:
  64. if '抖音内容已被删除或无法访问' in response['msg']:
  65. return "作品不存在", None, None
  66. except Exception as e:
  67. retry_count += 1
  68. logger.error(f"[+] 抖音{url}获取视频链接失败,失败信息{e}")
  69. time.sleep(1)
  70. return None, None, None
  71. @classmethod
  72. def get_text_ks_video(cls,url):
  73. try:
  74. headers = {
  75. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
  76. 'q=0.8,application/signed-exchange;v=b3;q=0.7',
  77. 'Accept-Language': 'zh-CN,zh;q=0.9',
  78. 'Cache-Control': 'no-cache',
  79. 'Pragma': 'no-cache',
  80. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  81. 'Chrome/127.0.0.0 Safari/537.36',
  82. }
  83. response = requests.request(url=url, method='GET', headers=headers, allow_redirects=False, timeout= 30)
  84. location = response.headers.get('Location', None)
  85. if location == "https://kuaishou.com/":
  86. return "作品不存在", None, None
  87. # video_id = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
  88. # location.split('?')[0] if location else url).group(2)
  89. match = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
  90. location.split('?')[0] if location else url)
  91. if match:
  92. video_id = match.group(2)
  93. else:
  94. parts = url.rstrip('/').split('/')
  95. if parts:
  96. video_id = parts[-1]
  97. logger.info(f"[+]提取到的视频ID=={video_id}")
  98. url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
  99. if not video_id or not video_id.strip():
  100. return None, None, None
  101. payload = json.dumps({
  102. "content_id": str(video_id)
  103. })
  104. headers = {
  105. 'Content-Type': 'application/json'
  106. }
  107. time.sleep(random.uniform(10, 50))
  108. response = requests.request("POST", url, headers=headers, data=payload, timeout= 30)
  109. response = response.json()
  110. code = response["code"]
  111. if code == 0:
  112. data = response["data"]["data"]
  113. content_type = data['content_type']
  114. if content_type == 'note':
  115. return "note","note"
  116. video_url = data["video_url_list"][0]["video_url"]
  117. original_title = data["title"]
  118. return video_url, original_title, video_id
  119. elif code == 27006:
  120. if "作品不存在" in response['msg'] or "内容不存在" in response['msg'] or "私密作品" in response['msg'] or "该作品仅允许关注者查看" in response['msg']:
  121. return "作品不存在", None, None
  122. time.sleep(3)
  123. except Exception as e:
  124. logger.error(f"[+] 快手{url}获取视频链接失败,失败信息{e}")
  125. return None, None,None
  126. @classmethod
  127. def get_text_hksp_video(cls, url):
  128. try:
  129. parsed_url = urlparse(url)
  130. query_params = parse_qs(parsed_url.query)
  131. video_id = query_params.get('vid')[0]
  132. req_url = "http://8.217.192.46:8889/crawler/hao_kan_shi_pin/detail"
  133. if not video_id or not video_id.strip():
  134. return None, None, None
  135. payload = json.dumps({
  136. "content_id": str(video_id)
  137. })
  138. headers = {
  139. 'Content-Type': 'application/json'
  140. }
  141. max_retries = 3
  142. retries = 0
  143. while retries < max_retries:
  144. try:
  145. response = requests.request("POST", req_url, headers=headers, data=payload, timeout=30)
  146. response = response.json()
  147. code = response["code"]
  148. if code == 0:
  149. data = response["data"]["data"]
  150. content_type = data['content_type']
  151. if content_type == 'note':
  152. return "note", "note", "note"
  153. video_url = data["video_url_list"][0]["video_url"]
  154. original_title = data["title"]
  155. return video_url, original_title, video_id
  156. else:
  157. retries += 1
  158. logger.warning(f"[+] 好看视频 {url} 请求返回 code 为 {code},正在进行第 {retries} 次重试...")
  159. except Exception as e:
  160. retries += 1
  161. logger.warning(f"[+] 好看视频 {url} 请求接口异常,正在进行第 {retries} 次重试...")
  162. except Exception as e:
  163. logger.error(f"[+] 好看视频{url}获取视频信息失败,失败信息{e}")
  164. return None, None, None
  165. @classmethod
  166. def get_video_url(cls, data, principal):
  167. try:
  168. url = data['video_url']
  169. logger.info(f"[+] url=={url}")
  170. if "&vid=" in url or "?modal_id=" in url or "?vid=" in url:
  171. host = urlparse(url).netloc
  172. logger.info(f"[+] host=={host}")
  173. else:
  174. # msg = html.unescape(url).split('?')[0]
  175. # pattern = re.search(r'https?://[^\s<>"\'\u4e00-\u9fff]+', msg)
  176. msg = html.unescape(url)
  177. pattern = re.search(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(/[-\w._~:/#[\]@!$&()*+,;=]*)', msg)
  178. logger.info(f"[+] pattern == {pattern}")
  179. # if pattern is None:
  180. # logger.error(f"[+] {url} 提取 url失败")
  181. # return "重新处理",None,None,None
  182. url = pattern.group()
  183. host = urlparse(url).netloc
  184. logger.info(f"[+] url == {url} host=={host}")
  185. if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com']:
  186. tag_transport_channel = "抖音"
  187. logger.info(f"[+] {url}开始获取抖音视频链接")
  188. url, original_title, video_id = cls.get_text_dy_video(url=url)
  189. elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com','live.kuaishou.com']:
  190. tag_transport_channel = "快手"
  191. logger.info(f"[+] {url}开始获取快手视频链接")
  192. url, original_title, video_id = cls.get_text_ks_video(url=url)
  193. elif host in ['haokan.baidu.com']:
  194. tag_transport_channel = "好看视频"
  195. logger.info(f"[+] {url}开始获取好看视频链接")
  196. url, original_title, video_id = cls.get_text_hksp_video(url=url)
  197. else:
  198. logger.error(f"[+] {url}该链接不是抖/快/好看视频 不做处理")
  199. AliyunLogger.logging(data["name"], principal, "", data["video_url"],
  200. "不是抖/快/好看视频 不做处理", "1001", str(data))
  201. return "链接不是抖/快/好看",None,None,None
  202. if url == "作品不存在":
  203. return "作品不存在",None,None,None
  204. return url, original_title, video_id, tag_transport_channel
  205. except Exception as e:
  206. logger.info(f"[+] 获取视频链接异常{e}")
  207. return "重新处理",None,None,None