import json
import re
from datetime import datetime

import requests
from lxml import html
from tenacity import retry

from applications import log
from applications.utils import proxy, request_retry

# Keyword arguments for tenacity's ``retry`` decorator, produced by the
# project's ``request_retry`` helper.
retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)


def extract_video_url(html_text: str) -> str | None:
    """
    Extract the first video URL found in an HTML document.

    NOTE(review): the original pattern list and matching logic were lost when
    this file was flattened (only the opening ``patterns = [ r'`` survived).
    The patterns below are a reconstruction; confirm them against version
    control before relying on this function.

    :param html_text: raw HTML of a detail page
    :return: the first URL matched by the highest-priority pattern, or None
        when the input is empty or no pattern matches
    """
    if not html_text:
        return None

    # Ordered from most specific to most permissive; the first hit wins.
    patterns = [
        r'<source\s+[^>]*?src=["\']([^"\']+?\.mp4[^"\']*)["\']',
        r'videoUrl\s*[=:]\s*["\']([^"\']+?\.mp4[^"\']*)["\']',
        r'(https?://[^\s"\'<>]+?\.mp4(?:\?[^\s"\'<>]*)?)',
    ]
    for pattern in patterns:
        found = re.search(pattern, html_text, re.IGNORECASE)
        if found:
            return found.group(1)
    return None


def extract_video_info(html_text: str) -> dict | None:
    """
    Extract publish time, account, ids, title and video URL from page HTML.

    All fields except the video URL are read from ``<meta>`` tags; the article
    id and account id are the two ``_``-separated parts of the last path
    segment of the ``og:url`` value.

    :param html_text: raw HTML of a detail page
    :return: dict with keys ``publish_timestamp`` (milliseconds),
        ``account_name``, ``article_id``, ``account_id``, ``video_url`` and
        ``title``
    :raises IndexError: when an expected ``<meta>`` tag is absent, or the last
        path segment of ``og:url`` contains no ``_``
    :raises ValueError: when the release date is not ``%Y-%m-%d %H:%M``
    """
    tree = html.fromstring(html_text)

    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]

    # Last path segment looks like "<article_id>_<account_id>".
    id_parts = sub_url.split("/")[-1].split("_")
    article_id = id_parts[0]
    account_id = id_parts[1]

    title = tree.xpath("//meta[@name='description']/@content")[0]

    # strptime yields a naive datetime, so timestamp() interprets it in the
    # local time zone of the running process.
    publish_dt = datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M")

    return {
        "publish_timestamp": int(publish_dt.timestamp() * 1000),
        "account_name": account_name,
        "article_id": article_id,
        "account_id": account_id,
        "video_url": extract_video_url(html_text),
        "title": title,
    }


@retry(**retry_desc)
def get_video_detail(article_url: str, timeout: float = 30) -> dict | None:
    """
    Fetch an article page and return the video details parsed from it.

    Request failures (including timeouts and non-2xx statuses) are logged and
    turned into a ``None`` return. Exceptions raised while parsing the page
    (see ``extract_video_info``) are not caught here, so they reach the
    ``retry`` wrapper.

    NOTE(review): because request errors are swallowed, the retry wrapper only
    sees them if ``retry_desc`` retries on a ``None`` result - confirm.

    :param article_url: URL of the article page
    :param timeout: seconds passed to ``requests.get`` so a stalled connection
        cannot block forever
    :return: the dict built by ``extract_video_info``, or None when the
        request fails
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(
            article_url,
            headers=headers,
            proxies=proxy(),
            timeout=timeout,
        )
        response.raise_for_status()
        return extract_video_info(response.text)
    except requests.exceptions.RequestException as e:
        log(
            task="sohu_detail_video",
            function="get_detail_video_url",
            message=f"API请求失败: {e}",
        )
    except json.JSONDecodeError as e:
        # Nothing in the try block decodes JSON at present; the handler is
        # kept so the set of exceptions absorbed here stays the same.
        log(
            task="sohu_detail_video",
            function="get_detail_video_url",
            message=f"响应解析失败: {e}",
        )
    return None


# Manual check:
# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
# res = get_video_detail(url)
#
# print(res)