123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- import re
- import json
- import requests
- from datetime import datetime
- from lxml import html
- from tenacity import retry
- from applications import log
- from applications.utils import proxy, request_retry
# Keyword arguments unpacked into tenacity's @retry below. The exact retry
# policy is produced by the project helper request_retry -- presumably up to
# 3 attempts with a 2-30 second back-off; confirm against that helper.
retry_desc = request_retry(
    retry_times=3,
    min_retry_delay=2,
    max_retry_delay=30,
)
- def extract_video_url(html_text: str) -> str | None:
- """
- extract video url from html text
- """
- patterns = [
- r'<source\s+src=["\'](.*?\.mp4)["\']',
- r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
- r"(https?://\S+?\.mp4(?:\?\S+)?)",
- ]
- video_urls = []
- for pattern in patterns:
- match = re.findall(pattern, html_text, re.IGNORECASE)
- video_urls.extend(match)
- if video_urls:
- return video_urls[0]
- else:
- return None
def extract_video_info(html_text: str) -> dict | None:
    """
    Extract video metadata from the <meta> tags of an article page.

    Reads ``og:release_date``, ``mediaid``, ``og:url`` and ``description``
    meta tags, and derives the article/account ids from the last path
    segment of ``og:url``, which is split on ``_`` as
    ``<article_id>_<account_id>``.

    Args:
        html_text: Raw HTML of the article page.

    Returns:
        A dict with keys ``publish_timestamp`` (milliseconds), ``account_name``,
        ``article_id``, ``account_id``, ``video_url`` (may be None) and
        ``title``. Despite the annotation, this function never returns None;
        it raises when required metadata is absent.

    Raises:
        IndexError: A required meta tag is missing, or the last path segment
            of ``og:url`` contains no ``_`` separator.
        ValueError: ``og:release_date`` is not in ``%Y-%m-%d %H:%M`` format.
    """
    tree = html.fromstring(html_text)

    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]

    # Drop any query string or fragment first so tracking parameters
    # (e.g. "?scm=...") do not leak into account_id.
    url_path = re.split(r"[?#]", sub_url, maxsplit=1)[0]
    id_parts = url_path.split("/")[-1].split("_")
    article_id = id_parts[0]
    account_id = id_parts[1]

    title = tree.xpath("//meta[@name='description']/@content")[0]

    # The parsed datetime is naive, so .timestamp() interprets it in the
    # host's local timezone. NOTE(review): confirm the host timezone matches
    # the timezone the site publishes in.
    publish_dt = datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M")
    publish_timestamp = int(publish_dt.timestamp() * 1000)

    return {
        "publish_timestamp": publish_timestamp,
        "account_name": account_name,
        "article_id": article_id,
        "account_id": account_id,
        "video_url": extract_video_url(html_text),
        "title": title,
    }
@retry(**retry_desc)
def get_video_detail(article_url: str, timeout: float = 30.0) -> dict | None:
    """
    Fetch an article page and extract its video metadata.

    Args:
        article_url: URL of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        The dict produced by extract_video_info, or None when the HTTP
        request fails (the failure is logged, not raised).

    Raises:
        Exceptions other than requests' RequestException and
        json.JSONDecodeError (for example parsing errors raised by
        extract_video_info) are not caught here and propagate to the
        tenacity @retry wrapper; how they are retried depends on retry_desc.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(
            article_url,
            headers=headers,
            proxies=proxy(),
            timeout=timeout,
        )
        response.raise_for_status()
        return extract_video_info(response.text)
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses.
        log(
            task="sohu_detail_video",
            function="get_detail_video_url",
            message=f"API请求失败: {e}",
        )
    except json.JSONDecodeError as e:
        log(
            task="sohu_detail_video",
            function="get_detail_video_url",
            message=f"响应解析失败: {e}",
        )
    return None
# Example usage:
# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
# res = get_video_detail(url)
#
# print(res)
|