dy_ks_get_url.py

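# Overview (as implemented below): resolve Douyin (抖音) and Kuaishou (快手)
# share links to a playable video URL, title, and video id by following the
# share-link redirect and then querying a crawler detail API.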
import html
import json
import os
import random
import re
import time
import uuid
from datetime import datetime
from urllib.parse import urlparse, parse_qs

import requests
from loguru import logger

from utils.aliyun_log import AliyunLogger
from utils.feishu_utils import Feishu


class Dy_KS:
    @classmethod
    def get_text_dy_video(cls, url):
        """Resolve a Douyin share link to (video_url, title, video_id)."""
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Extract the video id directly from the query string when possible.
                if "&vid=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('vid', [None])[0]
                elif "?modal_id=" in url:
                    parsed_url = urlparse(url)
                    params = parse_qs(parsed_url.query)
                    video_id = params.get('modal_id', [None])[0]
                else:
                    # Short link: request once without following redirects and
                    # parse the id from the /video/<id> path of the Location header.
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
                    }
                    response = requests.request('GET', url, headers=headers,
                                                allow_redirects=False, timeout=30)
                    location = response.headers.get('Location', None)
                    video_id = re.search(r'/video/(\d+)/?',
                                         location.split('?')[0] if location else url).group(1)
                if not video_id or not video_id.strip():
                    return None, None, None
                # Ask the crawler detail API for the playable URL and title.
                detail_url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
                payload = json.dumps({"content_id": str(video_id)})
                headers = {'Content-Type': 'application/json'}
                response = requests.request("POST", detail_url, headers=headers, data=payload, timeout=60)
                response = response.json()
                code = response["code"]
                if code == 0:
                    data = response["data"]["data"]
                    video_url = data["video_url_list"][0]["video_url"]
                    original_title = data["title"]
                    return video_url, original_title, video_id
                if code == 22002 and '抖音内容已被删除或无法访问' in response['msg']:
                    return "作品不存在", None, None
            except Exception as e:
                logger.error(f"[+] 抖音{url}获取视频链接失败,失败信息{e}")
            # No usable result on this attempt: count it and retry after a short pause.
            retry_count += 1
            time.sleep(1)
        return None, None, None
    @classmethod
    def get_text_ks_video(cls, url):
        """Resolve a Kuaishou share link to (video_url, title, video_id)."""
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                          'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            }
            # Follow the redirect manually and parse the id from the share-link path.
            response = requests.request('GET', url, headers=headers, allow_redirects=False, timeout=30)
            location = response.headers.get('Location', None)
            video_id = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
                                 location.split('?')[0] if location else url).group(2)
            if not video_id or not video_id.strip():
                return None, None, None
            # Ask the crawler detail API for the playable URL and title.
            detail_url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
            payload = json.dumps({"content_id": str(video_id)})
            headers = {'Content-Type': 'application/json'}
            # Random pause to spread out requests to the detail API.
            time.sleep(random.uniform(10, 50))
            response = requests.request("POST", detail_url, headers=headers, data=payload, timeout=30)
            response = response.json()
            code = response["code"]
            if code == 0:
                data = response["data"]["data"]
                content_type = data['content_type']
                if content_type == 'note':
                    # Image/text note rather than a video; return a 3-tuple so
                    # callers can unpack it consistently.
                    return "note", "note", video_id
                video_url = data["video_url_list"][0]["video_url"]
                original_title = data["title"]
                return video_url, original_title, video_id
            elif code == 27006:
                if ("作品不存在" in response['msg'] or "内容不存在" in response['msg']
                        or "私密作品" in response['msg'] or "该作品仅允许关注者查看" in response['msg']):
                    return "作品不存在", None, None
            time.sleep(3)
        except Exception as e:
            logger.error(f"[+] 快手{url}获取视频链接失败,失败信息{e}")
        return None, None, None
    @classmethod
    def get_video_url(cls, data, principal):
        """Dispatch a share link to the Douyin or Kuaishou resolver by host."""
        try:
            url = data['video_url']
            logger.info(f"[+] url=={url}")
            if "&vid=" in url or "?modal_id=" in url:
                host = urlparse(url).netloc
                logger.info(f"[+] host=={host}")
            else:
                # The field may contain surrounding text; pull out the first http(s) link.
                # msg = html.unescape(url).split('?')[0]
                # pattern = re.search(r'https?://[^\s<>"\'\u4e00-\u9fff]+', msg)
                msg = html.unescape(url)
                pattern = re.search(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(/[-\w._~:/#[\]@!$&()*+,;=]*)', msg)
                logger.info(f"[+] pattern == {pattern}")
                if pattern is None:
                    logger.error(f"[+] {url} 提取 url失败")
                    return "重新处理", None, None, None
                url = pattern.group()
                host = urlparse(url).netloc
                logger.info(f"[+] url == {url} host=={host}")
            # Route by host: Douyin and Kuaishou links are resolved, everything else is skipped.
            if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com']:
                tag_transport_channel = "抖音"
                logger.info(f"[+] {url}开始获取抖音视频链接")
                url, original_title, video_id = cls.get_text_dy_video(url=url)
            elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com']:
                tag_transport_channel = "快手"
                logger.info(f"[+] {url}开始获取快手视频链接")
                url, original_title, video_id = cls.get_text_ks_video(url=url)
            else:
                logger.error(f"[+] {url}该链接不是抖/快 不做处理")
                AliyunLogger.logging(data["name"], principal, "", data["video_url"],
                                     "不是抖/快不做处理", "1001", str(data))
                return "链接不是抖/快", None, None, None
            if url == "作品不存在":
                return "作品不存在", None, None, None
            return url, original_title, video_id, tag_transport_channel
        except Exception as e:
            logger.info(f"[+] 获取视频链接异常{e}")
            return "重新处理", None, None, None
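

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the production flow):
# the dict keys below mirror what get_video_url reads from `data`; the share
# link and the `principal` value are hypothetical placeholders.
if __name__ == "__main__":
    sample = {
        "name": "demo_account",                        # forwarded to AliyunLogger on skip
        "video_url": "https://v.douyin.com/xxxxxxx/",  # hypothetical Douyin share link
    }
    video_url, title, video_id, channel = Dy_KS.get_video_url(sample, principal="demo")
    logger.info(f"[+] video_url={video_url}, title={title}, video_id={video_id}, channel={channel}")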