get_detail.py

import re
import json
from datetime import datetime

import requests
from lxml import html
from tenacity import retry

from applications import log
from applications.utils import proxy, request_retry

retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
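# `request_retry` is a project-internal helper; judging by the `@retry(**retry_desc)`
# usage below, it is assumed to return a dict of keyword arguments for tenacity's
# `retry` decorator: here 3 attempts with a (presumably seconds-based) 2-30 delay window.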
def extract_video_url(html_text: str) -> str | None:
    """
    extract video url from html text
    """
    patterns = [
        r'<source\s+src=["\'](.*?\.mp4)["\']',
        r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
        r"(https?://\S+?\.mp4(?:\?\S+)?)",
    ]
    video_urls = []
    for pattern in patterns:
        matches = re.findall(pattern, html_text, re.IGNORECASE)
        video_urls.extend(matches)
    if video_urls:
        return video_urls[0]
    return None
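# Illustrative check (hypothetical snippet, not a real Sohu page):
#   extract_video_url('<source src="https://v.example.com/clip.mp4">')
#   -> "https://v.example.com/clip.mp4"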
def extract_video_info(html_text: str) -> dict | None:
    """
    extract video metadata (publish time, account, article id, video url, title) from html text
    """
    tree = html.fromstring(html_text)
    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]
    article_id = sub_url.split("/")[-1].split("_")[0]
    account_id = sub_url.split("/")[-1].split("_")[1]
    title = tree.xpath("//meta[@name='description']/@content")[0]
    response = {
        "publish_timestamp": int(
            datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M").timestamp() * 1000
        ),
        "account_name": account_name,
        "article_id": article_id,
        "account_id": account_id,
        "video_url": extract_video_url(html_text),
        "title": title,
    }
    return response
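# The og:url content is expected to end with "<article_id>_<account_id>"; e.g. the
# sample link at the bottom of this file ("https://www.sohu.com/a/877211651_121141867...")
# would yield article_id "877211651" and account_id "121141867". Note that each `[0]`
# lookup above raises IndexError if the corresponding <meta> tag is absent.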
@retry(**retry_desc)
def get_video_detail(article_url: str) -> dict | None:
    """
    fetch the article page and extract its video detail info
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(article_url, headers=headers, proxies=proxy())
        response.raise_for_status()
        video_info = extract_video_info(response.text)
        return video_info
    except requests.exceptions.RequestException as e:
        log(
            task="sohu_detail_video",
            function="get_video_detail",
            message=f"API request failed: {e}",
        )
    except json.JSONDecodeError as e:
        log(
            task="sohu_detail_video",
            function="get_video_detail",
            message=f"Response parsing failed: {e}",
        )
    return None
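# Note: because RequestException / JSONDecodeError are caught and None is returned,
# the tenacity retry above only re-runs this function for exceptions that escape the
# try block (e.g. an IndexError from extract_video_info), unless request_retry also
# configures result-based retrying.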
# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
# res = get_video_detail(url)
#
# print(res)