# dy_ks_get_url.py
import html
import json
import os
import random
import re
import time
import uuid
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import requests
from loguru import logger

from utils.aliyun_log import AliyunLogger
from utils.feishu_utils import Feishu
  14. class Dy_KS:
  15. @classmethod
  16. def get_text_dy_video(cls,url):
  17. max_retries = 3
  18. retry_count = 0
  19. while retry_count < max_retries:
  20. try:
  21. if "&vid=" in url:
  22. parsed_url = urlparse(url)
  23. params = parse_qs(parsed_url.query)
  24. video_id = params.get('vid', [None])[0]
  25. elif "?modal_id=" in url:
  26. parsed_url = urlparse(url)
  27. params = parse_qs(parsed_url.query)
  28. video_id = params.get('modal_id', [None])[0]
  29. else:
  30. headers = {
  31. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
  32. 'q=0.8,application/signed-exchange;v=b3;q=0.7',
  33. 'Accept-Language': 'zh-CN,zh;q=0.9',
  34. 'Cache-Control': 'no-cache',
  35. 'Pragma': 'no-cache',
  36. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  37. 'Chrome/127.0.0.0 Safari/537.36',
  38. }
  39. response = requests.request(url=url, method='GET', headers=headers, allow_redirects=False, timeout=30)
  40. location = response.headers.get('Location', None)
  41. match = re.search(r'/video/(\d+)/?', location.split('?')[0] if location else url)
  42. if match:
  43. video_id = match.group(1)
  44. elif "&vid=" in location:
  45. video_id = re.search(r'vid=(\d+)', location).group(1)
  46. url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
  47. if not video_id or not video_id.strip():
  48. return None, None, None
  49. payload = json.dumps({
  50. "content_id": str(video_id)
  51. })
  52. headers = {
  53. 'Content-Type': 'application/json'
  54. }
  55. response = requests.request("POST", url, headers=headers, data=payload, timeout= 60)
  56. response = response.json()
  57. code = response["code"]
  58. if code == 0:
  59. data = response["data"]["data"]
  60. video_url = data["video_url_list"][0]["video_url"]
  61. original_title = data["title"]
  62. return video_url, original_title, video_id
  63. if code == 22002:
  64. if '抖音内容已被删除或无法访问' in response['msg']:
  65. return "作品不存在", None, None
  66. except Exception as e:
  67. retry_count += 1
  68. logger.error(f"[+] 抖音{url}获取视频链接失败,失败信息{e}")
  69. time.sleep(1)
  70. return None, None, None
  71. @classmethod
  72. def get_text_ks_video(cls,url):
  73. try:
  74. headers = {
  75. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
  76. 'q=0.8,application/signed-exchange;v=b3;q=0.7',
  77. 'Accept-Language': 'zh-CN,zh;q=0.9',
  78. 'Cache-Control': 'no-cache',
  79. 'Pragma': 'no-cache',
  80. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  81. 'Chrome/127.0.0.0 Safari/537.36',
  82. }
  83. response = requests.request(url=url, method='GET', headers=headers, allow_redirects=False, timeout= 30)
  84. location = response.headers.get('Location', None)
  85. video_id = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
  86. location.split('?')[0] if location else url).group(2)
  87. url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
  88. if not video_id or not video_id.strip():
  89. return None, None, None
  90. payload = json.dumps({
  91. "content_id": str(video_id)
  92. })
  93. headers = {
  94. 'Content-Type': 'application/json'
  95. }
  96. time.sleep(random.uniform(10, 50))
  97. response = requests.request("POST", url, headers=headers, data=payload, timeout= 30)
  98. response = response.json()
  99. code = response["code"]
  100. if code == 0:
  101. data = response["data"]["data"]
  102. content_type = data['content_type']
  103. if content_type == 'note':
  104. return "note","note"
  105. video_url = data["video_url_list"][0]["video_url"]
  106. original_title = data["title"]
  107. return video_url, original_title, video_id
  108. elif code == 27006:
  109. if "作品不存在" in response['msg'] or "内容不存在" in response['msg'] or "私密作品" in response['msg'] or "该作品仅允许关注者查看" in response['msg']:
  110. return "作品不存在", None, None
  111. time.sleep(3)
  112. except Exception as e:
  113. logger.error(f"[+] 快手{url}获取视频链接失败,失败信息{e}")
  114. return None, None,None
  115. @classmethod
  116. def get_text_hksp_video(cls, url):
  117. try:
  118. parsed_url = urlparse(url)
  119. query_params = parse_qs(parsed_url.query)
  120. video_id = query_params.get('vid')[0]
  121. req_url = "http://8.217.192.46:8889/crawler/hao_kan_shi_pin/detail"
  122. if not video_id or not video_id.strip():
  123. return None, None, None
  124. payload = json.dumps({
  125. "content_id": str(video_id)
  126. })
  127. headers = {
  128. 'Content-Type': 'application/json'
  129. }
  130. max_retries = 3
  131. retries = 0
  132. while retries < max_retries:
  133. try:
  134. response = requests.request("POST", req_url, headers=headers, data=payload, timeout=30)
  135. response = response.json()
  136. code = response["code"]
  137. if code == 0:
  138. data = response["data"]["data"]
  139. content_type = data['content_type']
  140. if content_type == 'note':
  141. return "note", "note", "note"
  142. video_url = data["video_url_list"][0]["video_url"]
  143. original_title = data["title"]
  144. return video_url, original_title, video_id
  145. else:
  146. retries += 1
  147. logger.warning(f"[+] 好看视频 {url} 请求返回 code 为 {code},正在进行第 {retries} 次重试...")
  148. except Exception as e:
  149. retries += 1
  150. logger.warning(f"[+] 好看视频 {url} 请求接口异常,正在进行第 {retries} 次重试...")
  151. except Exception as e:
  152. logger.error(f"[+] 好看视频{url}获取视频信息失败,失败信息{e}")
  153. return None, None, None
  154. @classmethod
  155. def get_video_url(cls, data, principal):
  156. try:
  157. url = data['video_url']
  158. logger.info(f"[+] url=={url}")
  159. if "&vid=" in url or "?modal_id=" in url or "?vid=" in url:
  160. host = urlparse(url).netloc
  161. logger.info(f"[+] host=={host}")
  162. else:
  163. # msg = html.unescape(url).split('?')[0]
  164. # pattern = re.search(r'https?://[^\s<>"\'\u4e00-\u9fff]+', msg)
  165. msg = html.unescape(url)
  166. pattern = re.search(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(/[-\w._~:/#[\]@!$&()*+,;=]*)', msg)
  167. logger.info(f"[+] pattern == {pattern}")
  168. if pattern is None:
  169. logger.error(f"[+] {url} 提取 url失败")
  170. return "重新处理",None,None,None
  171. url = pattern.group()
  172. host = urlparse(url).netloc
  173. logger.info(f"[+] url == {url} host=={host}")
  174. if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com']:
  175. tag_transport_channel = "抖音"
  176. logger.info(f"[+] {url}开始获取抖音视频链接")
  177. url, original_title, video_id = cls.get_text_dy_video(url=url)
  178. elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com']:
  179. tag_transport_channel = "快手"
  180. logger.info(f"[+] {url}开始获取快手视频链接")
  181. url, original_title, video_id = cls.get_text_ks_video(url=url)
  182. elif host in ['haokan.baidu.com']:
  183. tag_transport_channel = "好看视频"
  184. logger.info(f"[+] {url}开始获取好看视频链接")
  185. url, original_title, video_id = cls.get_text_hksp_video(url=url)
  186. else:
  187. logger.error(f"[+] {url}该链接不是抖/快/好看视频 不做处理")
  188. AliyunLogger.logging(data["name"], principal, "", data["video_url"],
  189. "不是抖/快/好看视频 不做处理", "1001", str(data))
  190. return "链接不是抖/快/好看",None,None,None
  191. if url == "作品不存在":
  192. return "作品不存在",None,None,None
  193. return url, original_title, video_id, tag_transport_channel
  194. except Exception as e:
  195. logger.info(f"[+] 获取视频链接异常{e}")
  196. return "重新处理",None,None,None