"""
热点宝画像数据工具（示例）

调用内部爬虫服务获取账号/内容的粉丝画像。
"""
import asyncio
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, TypedDict

import requests

from agent.tools import tool, ToolResult
from utils.tool_logging import format_tool_result_for_log, log_tool_call

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Labels handed to log_tool_call (through _log_return) to identify each tool in the tool log.
_LABEL_ACCOUNT = "工具调用：get_account_fans_portrait -> 抖音账号粉丝画像（热点宝）"
_LABEL_CONTENT = "工具调用：get_content_fans_portrait -> 内容点赞用户画像（热点宝）"
_LABEL_BATCH = "工具调用：batch_fetch_portraits -> 批量获取内容/账号画像（热点宝）"

# Maximum number of candidate entries batch_fetch_portraits accepts in a single call.
BATCH_MAX_ITEMS = 30


# Crawler-service endpoints that the _sync_fetch_* helpers POST their payload to.
ACCOUNT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/account_fans_portrait"
CONTENT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/video_like_portrait"
# Request timeout (seconds) used when a tool is called with timeout=None.
DEFAULT_TIMEOUT = 60.0


class _PortraitOk(TypedDict):
    output: str
    has_portrait: bool
    portrait_data: Dict[str, Any]
    raw_data: Any


def _log_return(label: str, params: Dict[str, Any], r: ToolResult) -> ToolResult:
    """Record a finished tool call in the tool log and hand the result back.

    Args:
        label: Log label identifying the tool.
        params: Call parameters to record alongside the result.
        r: The result the tool is about to return.

    Returns:
        ``r`` itself, so callers can write ``return _log_return(...)``.
    """
    rendered = format_tool_result_for_log(r)
    log_tool_call(label, params, rendered)
    return r


def _top_k(items: Dict[str, Any], k: int) -> List[Tuple[str, Any]]:
    def percent_value(entry: Tuple[str, Any]) -> float:
        metrics = entry[1] if isinstance(entry[1], dict) else {}
        return metrics.get("percentage")

    return sorted(items.items(), key=percent_value, reverse=True)[:k]


def _format_portrait_summary(
    header_line: str,
    link_line: str,
    portrait: Dict[str, Any],
) -> str:
    summary_lines = [header_line, link_line, ""]
    for k, v in portrait.items():
        if not isinstance(v, dict):
            continue
        if k in ("省份", "城市"):
            summary_lines.append(f"【{k} TOP5】分布")
            items = _top_k(v, 5)
        else:
            summary_lines.append(f"【{k}】分布")
            items = v.items()

        for name, metrics in items:
            ratio = metrics.get("percentage")
            tgi = metrics.get("preference")
            summary_lines.append(f"  {name}: {ratio} (偏好度: {tgi})")
        summary_lines.append("")
    return "\n".join(summary_lines)


def _validate_account_id(account_id: str) -> Optional[str]:
    if not account_id or not isinstance(account_id, str):
        return "account_id 参数无效：必须是非空字符串"
    if not account_id.startswith("MS4wLjABAAAA"):
        return (
            f"account_id 格式错误：必须以 MS4wLjABAAAA 开头，"
            f"当前值: {account_id[:min(20, len(account_id))]}..."
        )
    return None


def _validate_content_id(content_id: str) -> Optional[str]:
    if not content_id or not isinstance(content_id, str):
        return "content_id 参数无效：必须是非空字符串"
    if not content_id.isdigit():
        return f"content_id 格式错误：aweme_id 应该是纯数字，当前值: {content_id[:20]}..."
    if len(content_id) < 15 or len(content_id) > 25:
        return f"content_id 长度异常：期望 15-25 位数字，实际 {len(content_id)} 位"
    return None


def _dimension_flags(
    need_province: bool,
    need_city: bool,
    need_city_level: bool,
    need_gender: bool,
    need_age: bool,
    need_phone_brand: bool,
    need_phone_price: bool,
) -> Dict[str, bool]:
    return {
        "need_province": need_province,
        "need_city": need_city,
        "need_city_level": need_city_level,
        "need_gender": need_gender,
        "need_age": need_age,
        "need_phone_brand": need_phone_brand,
        "need_phone_price": need_phone_price,
    }


def _sync_fetch_account_portrait(
    account_id: str,
    flags: Dict[str, bool],
    request_timeout: float,
) -> Tuple[Optional[str], Optional[_PortraitOk]]:
    """Fetch and summarise the fans portrait of one account (blocking call).

    Validates ``account_id``, POSTs ``{"account_id": ..., **flags}`` to
    ``ACCOUNT_FANS_PORTRAIT_API`` and reads the portrait from
    ``response["data"]["data"]``.

    Args:
        account_id: Account sec_uid.
        flags: Dimension switches merged into the request payload.
        request_timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        ``(error_message, None)`` on validation or request failure, otherwise
        ``(None, _PortraitOk)``. A response without a usable portrait is still
        a success; its ``has_portrait`` is ``False``.
    """
    err = _validate_account_id(account_id)
    if err:
        return err, None
    payload = {"account_id": account_id, **flags}
    try:
        response = requests.post(
            ACCOUNT_FANS_PORTRAIT_API,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=request_timeout,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as e:
        return f"HTTP {e.response.status_code}: {e.response.text}", None
    except requests.exceptions.Timeout:
        return f"请求超时（{request_timeout}秒）", None
    except requests.exceptions.RequestException as e:
        return f"网络错误: {str(e)}", None
    except Exception as e:
        logger.error(
            "account portrait request failed",
            extra={"account_id": account_id, "error": str(e)},
            exc_info=True,
        )
        return f"未知错误: {str(e)}", None

    # Any level that is missing or not a mapping, including a top-level body
    # that is not a JSON object, is treated as "no portrait" rather than
    # raising AttributeError.
    data_block = data.get("data") if isinstance(data, dict) else None
    portrait = data_block.get("data") if isinstance(data_block, dict) else None
    if not isinstance(portrait, dict):
        portrait = {}

    header = f"账号 {account_id} 的粉丝画像"
    link = (
        f"画像链接：https://douhot.douyin.com/creator/detail?"
        f"active_tab=creator_fans_portrait&creator_id={account_id}"
    )
    output = _format_portrait_summary(header, link, portrait)
    # Valid means at least one dimension carries a non-empty distribution.
    has_valid = any(isinstance(v, dict) and bool(v) for v in portrait.values())
    return None, _PortraitOk(
        output=output,
        has_portrait=has_valid,
        portrait_data=portrait,
        raw_data=data,
    )


def _sync_fetch_content_portrait(
    content_id: str,
    flags: Dict[str, bool],
    request_timeout: float,
) -> Tuple[Optional[str], Optional[_PortraitOk]]:
    """Fetch and summarise the liker portrait of one video (blocking call).

    Validates ``content_id``, POSTs ``{"content_id": ..., **flags}`` to
    ``CONTENT_FANS_PORTRAIT_API`` and reads the portrait from
    ``response["data"]["data"]``.

    Args:
        content_id: Video aweme_id.
        flags: Dimension switches merged into the request payload.
        request_timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        ``(error_message, None)`` on validation or request failure, otherwise
        ``(None, _PortraitOk)``. A response without a usable portrait is still
        a success; its ``has_portrait`` is ``False``.
    """
    err = _validate_content_id(content_id)
    if err:
        return err, None
    payload = {"content_id": content_id, **flags}
    try:
        response = requests.post(
            CONTENT_FANS_PORTRAIT_API,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=request_timeout,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as e:
        return f"HTTP {e.response.status_code}: {e.response.text}", None
    except requests.exceptions.Timeout:
        return f"请求超时（{request_timeout}秒）", None
    except requests.exceptions.RequestException as e:
        return f"网络错误: {str(e)}", None
    except Exception as e:
        logger.error(
            "content portrait request failed",
            extra={"content_id": content_id, "error": str(e)},
            exc_info=True,
        )
        return f"未知错误: {str(e)}", None

    # Any level that is missing or not a mapping, including a top-level body
    # that is not a JSON object, is treated as "no portrait" rather than
    # raising AttributeError.
    data_block = data.get("data") if isinstance(data, dict) else None
    portrait = data_block.get("data") if isinstance(data_block, dict) else None
    if not isinstance(portrait, dict):
        portrait = {}

    header = f"内容 {content_id} 的点赞用户画像"
    link = (
        f"画像链接：https://douhot.douyin.com/video/detail?"
        f"active_tab=video_fans&video_id={content_id}"
    )
    output = _format_portrait_summary(header, link, portrait)
    # Valid means at least one dimension carries a non-empty distribution.
    has_valid = any(isinstance(v, dict) and bool(v) for v in portrait.values())
    return None, _PortraitOk(
        output=output,
        has_portrait=has_valid,
        portrait_data=portrait,
        raw_data=data,
    )


@tool(description="获取抖音账号粉丝画像（热点宝），支持选择画像维度")
async def get_account_fans_portrait(
    account_id: str,
    need_province: bool = False,
    need_city: bool = False,
    need_city_level: bool = False,
    need_gender: bool = False,
    need_age: bool = True,
    need_phone_brand: bool = False,
    need_phone_price: bool = False,
    timeout: Optional[float] = None,
) -> ToolResult:
    """
    Fetch the fans portrait of a Douyin account (Redianbao data).

    Returns portrait data for the given account across the requested
    dimensions (age, gender, region, ...).

    Args:
        account_id: Douyin account ID (use author.sec_uid)
        need_province: Whether to fetch the province distribution, default False
        need_city: Whether to fetch the city distribution, default False
        need_city_level: Whether to fetch the city-tier distribution
            (tier 1 / new tier 1 / tier 2, ...), default False
        need_gender: Whether to fetch the gender distribution, default False
        need_age: Whether to fetch the age distribution, default True
        need_phone_brand: Whether to fetch the phone-brand distribution, default False
        need_phone_price: Whether to fetch the phone-price distribution, default False
        timeout: Timeout in seconds, default 60

    Returns:
        ToolResult containing:
            - output: text summary of the portrait
            - metadata.has_portrait: bool, whether valid portrait data exists
                - True: valid portrait data
                - False: no portrait data
            - metadata.portrait_data: structured portrait data (dict)
                - key: dimension name (e.g. "年龄", "性别")
                - value: distribution for that dimension (dict)
                    - percentage: share (e.g. "48.35%")
                    - preference: preference index / TGI (e.g. "210.05")
            - metadata.raw_data: raw API response
        On failure the ToolResult carries an empty output and the message in
        ``error``.

    Note:
        - account_id takes author.sec_uid (about 80 characters)
        - Only the age distribution is returned by default; set the matching
          flag to True for other dimensions
        - In the text summary, province and city data list the TOP5 only
        - Preference (TGI):
            - > 100: the group's preference is above average
            - = 100: average
            - < 100: below average
        - Use metadata.has_portrait to decide whether the portrait is valid;
          do not parse the output text
        - Read structured data from metadata.portrait_data
    """
    start_time = time.monotonic()
    flags = _dimension_flags(
        need_province,
        need_city,
        need_city_level,
        need_gender,
        need_age,
        need_phone_brand,
        need_phone_price,
    )
    call_params = {"account_id": account_id, **flags, "timeout": timeout}
    request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT

    # The fetch helper uses blocking `requests`; run it in the default
    # executor so the event loop is not stalled for up to `request_timeout`.
    loop = asyncio.get_running_loop()
    err, ok = await loop.run_in_executor(
        None, _sync_fetch_account_portrait, account_id, flags, request_timeout
    )
    duration_ms = int((time.monotonic() - start_time) * 1000)

    if err:
        logger.error(
            "get_account_fans_portrait failed",
            extra={"account_id": account_id, "error": err, "duration_ms": duration_ms},
        )
        return _log_return(
            _LABEL_ACCOUNT,
            call_params,
            ToolResult(
                title="账号粉丝画像获取失败",
                output="",
                error=err,
            ),
        )

    # The helper returns exactly one of (err, ok); this narrows the Optional.
    assert ok is not None
    logger.info(
        "get_account_fans_portrait completed",
        extra={
            "account_id": account_id,
            "has_portrait": ok["has_portrait"],
            "portrait_dimensions": list(ok["portrait_data"].keys()) if ok["portrait_data"] else [],
            "duration_ms": duration_ms,
        },
    )
    return _log_return(
        _LABEL_ACCOUNT,
        call_params,
        ToolResult(
            title=f"账号粉丝画像: {account_id}",
            output=ok["output"],
            long_term_memory=f"Fetched fans portrait for account '{account_id}'",
            metadata={
                "raw_data": ok["raw_data"],
                "has_portrait": ok["has_portrait"],
                "portrait_data": ok["portrait_data"],
            },
        ),
    )


@tool(description="获取抖音内容点赞用户画像（热点宝），支持选择画像维度")
async def get_content_fans_portrait(
    content_id: str,
    need_province: bool = False,
    need_city: bool = False,
    need_city_level: bool = False,
    need_gender: bool = False,
    need_age: bool = True,
    need_phone_brand: bool = False,
    need_phone_price: bool = False,
    timeout: Optional[float] = None,
) -> ToolResult:
    """
    Fetch the portrait of users who liked a Douyin video (Redianbao data).

    Returns portrait data for the likers of the given video across the
    requested dimensions (age, gender, region, ...).

    Args:
        content_id: Douyin content ID (use aweme_id)
        need_province: Whether to fetch the province distribution, default False
        need_city: Whether to fetch the city distribution, default False
        need_city_level: Whether to fetch the city-tier distribution
            (tier 1 / new tier 1 / tier 2, ...), default False
        need_gender: Whether to fetch the gender distribution, default False
        need_age: Whether to fetch the age distribution, default True
        need_phone_brand: Whether to fetch the phone-brand distribution, default False
        need_phone_price: Whether to fetch the phone-price distribution, default False
        timeout: Timeout in seconds, default 60

    Returns:
        ToolResult containing:
            - output: text summary of the portrait
            - metadata.has_portrait: bool, whether valid portrait data exists
                - True: valid portrait data
                - False: no portrait data (fall back to the account portrait)
            - metadata.portrait_data: structured portrait data (dict)
                - key: dimension name (e.g. "年龄", "性别")
                - value: distribution for that dimension (dict)
                    - percentage: share (e.g. "48.35%")
                    - preference: preference index / TGI (e.g. "210.05")
            - metadata.raw_data: raw API response
        On failure the ToolResult carries an empty output and the message in
        ``error``.

    Note:
        - content_id takes aweme_id
        - Only the age distribution is returned by default; set the matching
          flag to True for other dimensions
        - In the text summary, province and city data list the TOP5 only
        - Preference (TGI):
            - > 100: the group's preference is above average
            - = 100: average
            - < 100: below average
        - Use metadata.has_portrait to decide whether the portrait is valid;
          do not parse the output text
        - If has_portrait is False, use get_account_fans_portrait as fallback
        - Read structured data from metadata.portrait_data
    """
    start_time = time.monotonic()
    flags = _dimension_flags(
        need_province,
        need_city,
        need_city_level,
        need_gender,
        need_age,
        need_phone_brand,
        need_phone_price,
    )
    call_params = {"content_id": content_id, **flags, "timeout": timeout}
    request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT

    # The fetch helper uses blocking `requests`; run it in the default
    # executor so the event loop is not stalled for up to `request_timeout`.
    loop = asyncio.get_running_loop()
    err, ok = await loop.run_in_executor(
        None, _sync_fetch_content_portrait, content_id, flags, request_timeout
    )
    duration_ms = int((time.monotonic() - start_time) * 1000)

    if err:
        logger.error(
            "get_content_fans_portrait failed",
            extra={"content_id": content_id, "error": err, "duration_ms": duration_ms},
        )
        return _log_return(
            _LABEL_CONTENT,
            call_params,
            ToolResult(
                title="内容点赞用户画像获取失败",
                output="",
                error=err,
            ),
        )

    # The helper returns exactly one of (err, ok); this narrows the Optional.
    assert ok is not None
    logger.info(
        "get_content_fans_portrait completed",
        extra={
            "content_id": content_id,
            "has_portrait": ok["has_portrait"],
            "portrait_dimensions": list(ok["portrait_data"].keys()) if ok["portrait_data"] else [],
            "duration_ms": duration_ms,
        },
    )
    return _log_return(
        _LABEL_CONTENT,
        call_params,
        ToolResult(
            title=f"内容点赞用户画像: {content_id}",
            output=ok["output"],
            long_term_memory=f"Fetched fans portrait for content '{content_id}'",
            metadata={
                "raw_data": ok["raw_data"],
                "has_portrait": ok["has_portrait"],
                "portrait_data": ok["portrait_data"],
            },
        ),
    )


def _batch_failure(call_params: Dict[str, Any], error: str) -> ToolResult:
    """Build, log and return the failed ToolResult for rejected batch input."""
    return _log_return(
        _LABEL_BATCH,
        call_params,
        ToolResult(title="批量画像失败", output="", error=error),
    )


async def _fetch_batch_entry(
    idx: int,
    entry: Any,
    flags: Dict[str, bool],
    request_timeout: float,
) -> Tuple[Dict[str, Any], str]:
    """Resolve one candidate into its result record and one-line summary.

    Requests the content portrait first; when that yields no valid portrait
    and fallback is allowed, requests the author's account portrait.
    """
    if not isinstance(entry, dict):
        skipped = {
            "aweme_id": None,
            "error": "条目不是对象",
            "content": None,
            "account": None,
        }
        return skipped, f"[{idx}] 跳过：条目不是 JSON 对象"

    aweme_id = entry.get("aweme_id") or entry.get("content_id")
    author_sec = entry.get("author_sec_uid") or entry.get("account_id")
    try_fallback = entry.get("try_account_fallback", True)
    if isinstance(try_fallback, str):
        try_fallback = try_fallback.strip().lower() in ("1", "true", "yes")
    try_fallback = bool(try_fallback)

    if not aweme_id or not isinstance(aweme_id, str):
        skipped = {
            "aweme_id": aweme_id,
            "error": "缺少 aweme_id",
            "content": None,
            "account": None,
        }
        return skipped, f"[{idx}] 跳过：缺少 aweme_id"

    # The fetch helpers use blocking `requests`; run them in the default
    # executor so the event loop stays responsive during the batch.
    loop = asyncio.get_running_loop()

    cerr, cok = await loop.run_in_executor(
        None, _sync_fetch_content_portrait, aweme_id, flags, request_timeout
    )
    if cerr or cok is None:
        content_block: Dict[str, Any] = {
            "ok": False,
            "error": cerr,
            "has_portrait": False,
            "portrait_data": {},
        }
    else:
        content_block = {
            "ok": True,
            "error": None,
            "has_portrait": cok["has_portrait"],
            "portrait_data": cok["portrait_data"],
            "output": cok["output"],
        }

    account_block: Dict[str, Any]
    if not try_fallback or content_block["has_portrait"]:
        skip_reason = (
            "try_account_fallback 为 false（如 douyin_user_videos 来源）"
            if not try_fallback
            else "内容侧已有有效画像，无需账号兜底"
        )
        account_block = {
            "attempted": False,
            "skipped_reason": skip_reason,
            "has_portrait": False,
            "portrait_data": {},
        }
    elif not author_sec or not isinstance(author_sec, str):
        account_block = {
            "attempted": False,
            "skipped_reason": "缺少 author_sec_uid，无法账号兜底",
            "has_portrait": False,
            "portrait_data": {},
        }
    else:
        aerr, aok = await loop.run_in_executor(
            None, _sync_fetch_account_portrait, author_sec, flags, request_timeout
        )
        if aerr or aok is None:
            account_block = {
                "attempted": True,
                "error": aerr,
                "has_portrait": False,
                "portrait_data": {},
            }
        else:
            account_block = {
                "attempted": True,
                "error": None,
                "has_portrait": aok["has_portrait"],
                "portrait_data": aok["portrait_data"],
                "output": aok["output"],
            }

    record: Dict[str, Any] = {
        "aweme_id": aweme_id,
        "author_sec_uid": author_sec if isinstance(author_sec, str) else None,
        "try_account_fallback": try_fallback,
        "content": content_block,
        "account": account_block,
        "error": None,
    }
    # Keep each item's footprint in the text output small.
    line = (
        f"[{idx}] aweme_id={aweme_id} "
        f"content_has_portrait={content_block.get('has_portrait')} "
        f"account_attempted={account_block.get('attempted')} "
        f"account_has_portrait={account_block.get('has_portrait')}"
    )
    return record, line


@tool(
    description=(
        "批量获取多条候选视频的画像：工具内依次请求内容点赞画像；"
        "若无画像且允许兜底则再请求作者粉丝画像。一次调用返回所有条目，减少对话轮次。"
        "完整结构化结果在同一条 tool 消息的 metadata JSON 中"
    ),
    hidden_params=["context"],
)
async def batch_fetch_portraits(
    candidates_json: str,
    need_province: bool = False,
    need_city: bool = False,
    need_city_level: bool = False,
    need_gender: bool = False,
    need_age: bool = True,
    need_phone_brand: bool = False,
    need_phone_price: bool = False,
    timeout: Optional[float] = None,
    context: Optional[Dict[str, Any]] = None,
) -> ToolResult:
    """
    Fetch content portraits in bulk, falling back to the account portrait
    where allowed (one tool call, several sequential HTTP requests).

    Args:
        candidates_json: JSON array string of at most BATCH_MAX_ITEMS objects.
            Fields per object:
            - aweme_id (required): video id (``content_id`` is accepted as an alias)
            - author_sec_uid (optional): author sec_uid, needed for the
              fallback (``account_id`` is accepted as an alias)
            - try_account_fallback (optional, default true): when false the
              account portrait is not requested
        need_* / timeout: same meaning as in the single-item portrait tools
        context: Listed in hidden_params. When it is a dict with a non-empty
            string ``trace_id``, that id is used to build the file path
            mentioned at the end of the output.

    Returns:
        ToolResult.output: one summary line per candidate, followed by a hint
            that the structured data is in the metadata. When a trace_id is
            available the hint also names
            ``<OUTPUT_DIR>/<trace_id>/output.json``; this function does not
            write that file itself.
        metadata.results: list in the same order as the candidates; each
            fetched item has ``content`` and ``account`` sub-objects.
        metadata.count: number of entries in ``results``.
        The result is created with include_metadata_in_llm=True.
        Invalid input (empty, not JSON, not an array, too many items) yields
        a ToolResult with an empty output and the message in ``error``.
    """
    text = candidates_json or ""
    flags = _dimension_flags(
        need_province,
        need_city,
        need_city_level,
        need_gender,
        need_age,
        need_phone_brand,
        need_phone_price,
    )
    # Built from the None-safe text so a missing argument is reported as
    # empty input instead of raising TypeError on the slice.
    call_params: Dict[str, Any] = {
        "candidates_json": text[:2000] + ("..." if len(text) > 2000 else ""),
        **flags,
        "timeout": timeout,
    }

    raw = text.strip()
    if not raw:
        return _batch_failure(call_params, "candidates_json 为空")
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        return _batch_failure(call_params, f"candidates_json 不是合法 JSON: {e}")
    if not isinstance(parsed, list):
        return _batch_failure(call_params, "candidates_json 必须是 JSON 数组")
    if len(parsed) > BATCH_MAX_ITEMS:
        return _batch_failure(call_params, f"条目数超过上限 {BATCH_MAX_ITEMS}，请分批调用")

    request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT

    results: List[Dict[str, Any]] = []
    output_chunks: List[str] = []
    # Candidates are processed one after another, in input order.
    for idx, entry in enumerate(parsed):
        record, line = await _fetch_batch_entry(idx, entry, flags, request_timeout)
        results.append(record)
        output_chunks.append(line)

    trace_id = None
    if isinstance(context, dict):
        tid = context.get("trace_id")
        if isinstance(tid, str) and tid.strip():
            trace_id = tid.strip()

    meta_hint = (
        "\n\n本条 tool 消息在标题与摘要后附有 ## metadata (JSON)，其中 results 含每条 "
        "content/account 的 has_portrait 与 portrait_data"
    )
    if trace_id is not None:
        # Only mention the file when the path can actually be formed; without
        # a trace_id it would read ".../None/output.json".
        out_display = (os.getenv("OUTPUT_DIR") or ".cache/output").strip()
        meta_hint += (
            "；若上下文被压缩，可用 read_file 读取："
            f" {out_display}/{trace_id}/output.json"
        )
    output_body = "\n".join(output_chunks) + meta_hint

    logger.info(
        "batch_fetch_portraits completed",
        extra={
            "count": len(results),
            "candidates": len(parsed),
            "trace_id": trace_id,
        },
    )

    meta: Dict[str, Any] = {
        "results": results,
        "count": len(results),
    }

    return _log_return(
        _LABEL_BATCH,
        call_params,
        ToolResult(
            title=f"批量画像完成 ({len(results)} 条)",
            output=output_body,
            long_term_memory=f"Batch portrait fetch for {len(results)} items",
            metadata=meta,
            include_metadata_in_llm=True,
        ),
    )