# dy_ks_get_url.py

import html
import json
import os
import random
import re
import time
import uuid
from datetime import datetime
from urllib.parse import urlparse, parse_qs

import requests
from loguru import logger

from utils.aliyun_log import AliyunLogger
from utils.feishu_utils import Feishu


class Dy_KS:
    @classmethod
    def get_text_dy_video(cls, url):
        """Resolve a Douyin share link to (video_url, original_title, video_id)."""
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Take the video id from the query string when it is present,
                # otherwise follow the short-link redirect and parse it from the path.
                if "&vid=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('vid', [None])[0]
                elif "?modal_id=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('modal_id', [None])[0]
                else:
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                                  'q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/127.0.0.0 Safari/537.36',
                    }
                    response = requests.get(url, headers=headers, allow_redirects=False, timeout=30)
                    location = response.headers.get('Location', None)
                    video_id = re.search(r'/video/(\d+)/?', location.split('?')[0] if location else url).group(1)
                if not video_id or not video_id.strip():
                    return None, None, None
                # Ask the crawler service for the video detail.
                url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
                payload = json.dumps({"content_id": str(video_id)})
                headers = {'Content-Type': 'application/json'}
                response = requests.post(url, headers=headers, data=payload, timeout=60)
                response = response.json()
                code = response["code"]
                if code == 0:
                    data = response["data"]["data"]
                    video_url = data["video_url_list"][0]["video_url"]
                    original_title = data["title"]
                    return video_url, original_title, video_id
                if code == 22002 and '抖音内容已被删除或无法访问' in response['msg']:
                    # The content was deleted or is no longer accessible.
                    return "作品不存在", None, None
                # Unexpected response code: count it as a failed attempt so the loop terminates.
                retry_count += 1
            except Exception as e:
                retry_count += 1
                logger.error(f"[+] Douyin {url} failed to resolve video url, error: {e}")
                time.sleep(1)
        return None, None, None
    @classmethod
    def get_text_ks_video(cls, url):
        """Resolve a Kuaishou share link to (video_url, original_title, video_id)."""
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                          'q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/127.0.0.0 Safari/537.36',
            }
            # Follow the short-link redirect (without loading the page) and pull
            # the content id out of the redirect target's path.
            response = requests.get(url, headers=headers, allow_redirects=False, timeout=30)
            location = response.headers.get('Location', None)
            video_id = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
                                 location.split('?')[0] if location else url).group(2)
            if not video_id or not video_id.strip():
                return None, None, None
            # Ask the crawler service for the video detail.
            url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
            payload = json.dumps({"content_id": str(video_id)})
            headers = {'Content-Type': 'application/json'}
            # Randomized delay to avoid hammering the crawler service.
            time.sleep(random.uniform(10, 50))
            response = requests.post(url, headers=headers, data=payload, timeout=30)
            response = response.json()
            code = response["code"]
            if code == 0:
                data = response["data"]["data"]
                content_type = data['content_type']
                if content_type == 'note':
                    # Image/text notes carry no downloadable video.
                    return "note", "note", None
                video_url = data["video_url_list"][0]["video_url"]
                original_title = data["title"]
                return video_url, original_title, video_id
            elif code == 27006:
                if "作品不存在" in response['msg'] or "内容不存在" in response['msg'] or "私密作品" in response['msg']:
                    # Content does not exist, was removed, or is private.
                    return "作品不存在", None, None
            time.sleep(3)
        except Exception as e:
            logger.error(f"[+] Kuaishou {url} failed to resolve video url, error: {e}")
        return None, None, None
    @classmethod
    def get_video_url(cls, data, principal):
        """Dispatch a share link to the Douyin or Kuaishou resolver.

        Returns (url, original_title, video_id, tag_transport_channel).
        """
        try:
            url = data['video_url']
            if "&vid=" in url or "?modal_id=" in url:
                host = urlparse(url).netloc
            else:
                # The share text may contain extra words around the link; extract the first URL.
                msg = html.unescape(url).split('?')[0]
                pattern = re.search(r'https?://[^\s<>"\'\u4e00-\u9fff]+', msg)
                if not pattern:
                    return "重新处理", None, None, None
                url = pattern.group()
                host = urlparse(url).netloc
            if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com']:
                tag_transport_channel = "抖音"
                logger.info(f"[+] {url} resolving Douyin video url")
                url, original_title, video_id = cls.get_text_dy_video(url=url)
            elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com']:
                tag_transport_channel = "快手"
                logger.info(f"[+] {url} resolving Kuaishou video url")
                url, original_title, video_id = cls.get_text_ks_video(url=url)
            else:
                logger.error(f"[+] {url} is not a Douyin/Kuaishou link, skipping")
                AliyunLogger.logging(data["name"], principal, "", data["video_url"],
                                     "不是抖/快不做处理", "1001", str(data))
                return "链接不是抖/快", None, None, None
            if url == "作品不存在":
                return "作品不存在", None, None, None
            return url, original_title, video_id, tag_transport_channel
        except Exception as e:
            logger.info(f"[+] Exception while resolving video url: {e}")
            return "重新处理", None, None, None
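

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The shape of `data` is
    # assumed from the fields accessed above (`video_url`, `name`); the share link and
    # the `principal` value are placeholders, and a real run performs network requests
    # against the crawler service configured in the methods above.
    sample = {
        "name": "demo-task",
        "video_url": "https://v.douyin.com/xxxxxxx/",  # placeholder share link
    }
    url, title, video_id, channel = Dy_KS.get_video_url(sample, principal="demo-principal")
    logger.info(f"resolved: channel={channel}, video_id={video_id}, title={title}, url={url}")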