get_detail.py

import re
import json
from datetime import datetime

import requests
from lxml import html
from tenacity import retry

from applications import log
from applications.utils import proxy, request_retry

retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
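# `request_retry` is a project-internal helper; judging by the `@retry(**retry_desc)`
# usage below, it is assumed to return a dict of keyword arguments for tenacity's
# `retry` decorator: here 3 attempts with a (presumably seconds-based) 2-30 delay window.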
def extract_video_url(html_text: str) -> str | None:
    """
    extract video url from html text
    """
    patterns = [
        r'<source\s+src=["\'](.*?\.mp4)["\']',
        r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
        r"(https?://\S+?\.mp4(?:\?\S+)?)",
    ]
    video_urls = []
    for pattern in patterns:
        matches = re.findall(pattern, html_text, re.IGNORECASE)
        video_urls.extend(matches)
    if video_urls:
        return video_urls[0]
    return None
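# Illustrative check (hypothetical snippet, not a real Sohu page):
#   extract_video_url('<source src="https://v.example.com/clip.mp4">')
#   -> "https://v.example.com/clip.mp4"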
def extract_video_info(html_text: str) -> dict | None:
    """
    extract video metadata (publish time, account, article id, video url, title) from html text
    """
    tree = html.fromstring(html_text)
    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]
    article_id = sub_url.split("/")[-1].split("_")[0]
    account_id = sub_url.split("/")[-1].split("_")[1]
    title = tree.xpath("//meta[@name='description']/@content")[0]
    response = {
        "publish_timestamp": int(
            datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M").timestamp() * 1000
        ),
        "account_name": account_name,
        "article_id": article_id,
        "account_id": account_id,
        "video_url": extract_video_url(html_text),
        "title": title,
    }
    return response
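# The og:url content is expected to end with "<article_id>_<account_id>"; e.g. the
# sample link at the bottom of this file ("https://www.sohu.com/a/877211651_121141867...")
# would yield article_id "877211651" and account_id "121141867". Note that each `[0]`
# lookup above raises IndexError if the corresponding <meta> tag is absent.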
@retry(**retry_desc)
def get_video_detail(article_url: str) -> dict | None:
    """
    fetch the article page and extract its video detail info
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(article_url, headers=headers, proxies=proxy())
        response.raise_for_status()
        video_info = extract_video_info(response.text)
        return video_info
    except requests.exceptions.RequestException as e:
        log(
            task="sohu_detail_video",
            function="get_video_detail",
            message=f"API request failed: {e}",
        )
    except json.JSONDecodeError as e:
        log(
            task="sohu_detail_video",
            function="get_video_detail",
            message=f"Response parsing failed: {e}",
        )
    return None
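# Note: because RequestException / JSONDecodeError are caught and None is returned,
# the tenacity retry above only re-runs this function for exceptions that escape the
# try block (e.g. an IndexError from extract_video_info), unless request_retry also
# configures result-based retrying.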
# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
# res = get_video_detail(url)
#
# print(res)