"""HTTP helpers for the crawler service's ``wei_xin`` endpoints.

Two functions POST JSON to the crawler API rooted at ``base_url``; the third
downloads an article page directly and scrapes account fields out of its HTML.
All three log request failures through ``applications.log`` and return
``None`` instead of raising.
"""
from __future__ import annotations

import json
import re

import requests
from fake_useragent import FakeUserAgent
from tenacity import retry

from applications import log
from applications.utils import request_retry

# Keyword arguments for tenacity's ``retry``; the exact policy is built by
# ``request_retry`` (defined outside this file).
retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)

# url from aigc
base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
headers = {"Content-Type": "application/json"}

# Timeout, in seconds, passed to every HTTP call made by this module.
_REQUEST_TIMEOUT = 120

# Patterns for the two single-quoted values scraped from an article page.
_NICKNAME_PATTERN = re.compile(r"hit_nickname:\s*'([^']+)'")
_USERNAME_PATTERN = re.compile(r"hit_username:\s*'([^']+)'")


def _post_json(endpoint: str, body: dict, task: str, log_data: dict) -> dict | None:
    """POST ``body`` as JSON to ``{base_url}/{endpoint}`` and return the decoded reply.

    :param endpoint: path segment appended to ``base_url``
    :param body: JSON-serialisable request payload
    :param task: value logged as both ``task`` and ``function`` on failure
    :param log_data: extra context attached to the failure log entry
    :return: the decoded JSON response, or ``None`` when the request failed or
        the response body was not valid JSON (both cases are logged)
    """
    target_url = f"{base_url}/{endpoint}"
    payload = json.dumps(body)
    try:
        response = requests.post(
            url=target_url,
            headers=headers,
            data=payload,
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()
    # The decode error must be caught first: in recent requests releases the
    # error raised by ``response.json()`` is also a ``RequestException``, so
    # with the opposite order this branch would never run.
    except json.JSONDecodeError as e:
        log(
            task=task,
            function=task,
            message=f"响应解析失败: {e}",
            data=log_data,
        )
    except requests.exceptions.RequestException as e:
        log(
            task=task,
            function=task,
            message=f"API请求失败: {e}",
            data=log_data,
        )
    return None


@retry(**retry_desc)
def get_article_detail(
    article_link: str, is_count: bool = False, is_cache: bool = True
) -> dict | None:
    """Fetch the detail of an official article from the crawler ``/detail`` endpoint.

    :param article_link: article URL, sent as ``content_link``
    :param is_count: forwarded unchanged as ``is_count``
    :param is_cache: forwarded unchanged as ``is_cache``
    :return: decoded JSON response, or ``None`` if the request or decoding failed
    """
    return _post_json(
        endpoint="detail",
        body={
            "content_link": article_link,
            "is_count": is_count,
            "is_ad": False,
            "is_cache": is_cache,
        },
        task="get_official_article_detail",
        log_data={"link": article_link},
    )


@retry(**retry_desc)
def get_article_list_from_account(account_id: str, index=None) -> dict | None:
    """Fetch an account's article list from the crawler ``/blogger`` endpoint.

    :param account_id: account identifier, sent as ``account_id``
    :param index: sent as ``cursor``; ``None`` is sent as JSON ``null``
    :return: decoded JSON response, or ``None`` if the request or decoding failed
    """
    return _post_json(
        endpoint="blogger",
        body={
            "account_id": account_id,
            "cursor": index,
        },
        task="get_official_account_article_list",
        log_data={"gh_id": account_id},
    )


@retry(**retry_desc)
def get_source_account_from_article(article_link: str) -> dict | None:
    """Scrape the account name and id embedded in an article page.

    The page is requested with a random User-Agent and its HTML is searched
    for the ``hit_nickname`` and ``hit_username`` values.

    :param article_link: URL of the article page to download
    :return: ``{'name': ..., 'gh_id': ...}`` when both values are found,
        ``{}`` when either is missing, ``None`` when the request failed (logged)
    """
    try:
        response = requests.get(
            url=article_link,
            headers={"User-Agent": FakeUserAgent().random},
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log(
            task="get_source_account_from_article",
            function="get_source_account_from_article",
            message=f"API请求失败: {e}",
            data={"link": article_link},
        )
        return None

    html_text = response.text
    nickname = _NICKNAME_PATTERN.search(html_text)
    username = _USERNAME_PATTERN.search(html_text)
    # Both fields are required; a partial match is reported as "not found".
    if nickname and username:
        return {
            "name": nickname.group(1),
            "gh_id": username.group(1),
        }
    return {}