# Server / external_demand
"""Export logic for demand elements/phrases once the WeChat index meets the threshold."""

from __future__ import annotations

import argparse
import json
from typing import Any

from app.hot_content.config import load_flow_config
from app.hot_content.repository import HotContentRepository

# NOTE(review): not referenced in the visible part of this file; presumably consumed by importers.
WXINDEX_EXPORT_THRESHOLD = 1_000_000.0  # matches the default value of WXINDEX_SCORE_THRESHOLD


# Point sources ("来源") accepted for export; also fixes the order in which categories are emitted.
POINT_CATEGORIES = ("灵感点", "目的点", "关键点")
# Values written to the "item_type" field of an export row: single word vs. whole point text.
ITEM_TYPE_ELEMENT = "元素"
ITEM_TYPE_PHRASE = "短语"


def get_latest_wxindex_score(trend_json: dict[str, Any] | None) -> float | None:
    """Return the latest total WeChat-index score stored in a trend payload.

    Args:
        trend_json: Parsed trend payload. The score is read from
            ``trend_json["wxindex"]["latest_total_score"]``.

    Returns:
        The score as a float, or ``None`` when the payload is not a dict, the
        ``wxindex`` entry is missing or not a dict, or the stored value cannot
        be converted to a float.
    """
    # Tolerate a missing payload the same way get_wxindex_keyword does.
    if not isinstance(trend_json, dict):
        return None
    wxindex = trend_json.get("wxindex")
    if not isinstance(wxindex, dict):
        return None
    try:
        return float(wxindex.get("latest_total_score"))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers may be too large to fit in a float.
        return None


def get_wxindex_keyword(trend_json: dict[str, Any] | None) -> str:
    """Return the keyword the WeChat index was looked up for.

    The stripped ``wxindex.keyword`` value is preferred; when it is absent or
    blank the stripped ``llm_selected_word`` value is used instead.

    Args:
        trend_json: Parsed trend payload, or ``None``.

    Returns:
        The keyword, or an empty string when the payload is not a dict or
        neither field holds a non-blank value.
    """
    if not isinstance(trend_json, dict):
        return ""

    section = trend_json.get("wxindex")
    primary = (
        str(section.get("keyword") or "").strip()
        if isinstance(section, dict)
        else ""
    )
    # Fall back to the LLM-selected word only when the index keyword is blank.
    return primary or str(trend_json.get("llm_selected_word") or "").strip()


def get_wxindex_trend(trend_json: dict[str, Any] | None) -> str:
    """Return the stripped ``wxindex.trend`` value of a trend payload.

    Args:
        trend_json: Parsed trend payload, or ``None``.

    Returns:
        The trend text, or an empty string when the payload is not a dict, the
        ``wxindex`` entry is missing or not a dict, or the trend is empty.
    """
    # Accept None/non-dict input like get_wxindex_keyword instead of raising.
    if not isinstance(trend_json, dict):
        return ""
    wxindex = trend_json.get("wxindex")
    if not isinstance(wxindex, dict):
        return ""
    return str(wxindex.get("trend") or "").strip()


def _to_contribution_score(value: Any) -> float | None:
    try:
        if value is None:
            return None
        return float(value)
    except (TypeError, ValueError):
        return None


def extract_matched_demand_name_list(word_row: dict[str, Any]) -> list[str]:
    """Collect the distinct demand names matched by a word row.

    Args:
        word_row: Word row whose ``"匹配需求列表"`` entry holds match dicts,
            each carrying a ``demand_name``.

    Returns:
        Stripped, non-empty demand names in first-seen order without
        duplicates; an empty list when the entry is missing or not a list.
    """
    raw_matches = word_row.get("匹配需求列表") or []
    if not isinstance(raw_matches, list):
        return []

    # A dict doubles as an insertion-ordered set for de-duplication.
    ordered: dict[str, None] = {}
    for entry in raw_matches:
        if isinstance(entry, dict):
            name = str(entry.get("demand_name") or "").strip()
            if name:
                ordered.setdefault(name, None)
    return list(ordered)


def extract_matched_demand_names(word_row: dict[str, Any]) -> str:
    """Return the word row's distinct matched demand names joined by single spaces."""
    demand_names = extract_matched_demand_name_list(word_row)
    return " ".join(demand_names)


def build_word_lookup(words_rows: list[Any]) -> dict[str, dict[str, Any]]:
    """Index word rows by their stripped ``"词"`` text.

    Args:
        words_rows: Candidate word rows; non-dict entries and rows with an
            empty word are ignored.

    Returns:
        Mapping from word text to its row. When a word occurs more than once
        the last row wins.
    """
    keyed_rows = (
        (str(entry.get("词") or "").strip(), entry)
        for entry in words_rows
        if isinstance(entry, dict)
    )
    return {text: entry for text, entry in keyed_rows if text}


def build_word_to_categories(points: list[Any]) -> dict[str, set[str]]:
    """Map each matched word to the point categories it appears under.

    Args:
        points: Point items; each usable item is a dict with a ``"来源"``
            category listed in ``POINT_CATEGORIES`` and a ``"匹配词列表"`` list
            of hit dicts carrying a ``"词"`` text.

    Returns:
        Mapping from stripped word text to the set of categories of the points
        that list it. Empty when ``points`` is not a list.
    """
    mapping: dict[str, set[str]] = {}
    if not isinstance(points, list):
        return mapping

    for point in points:
        if not isinstance(point, dict):
            continue
        source = str(point.get("来源") or "").strip()
        hits = point.get("匹配词列表") or []
        # Skip points from unknown sources or with a malformed hit list.
        if source not in POINT_CATEGORIES or not isinstance(hits, list):
            continue
        for hit in hits:
            text = str(hit.get("词") or "").strip() if isinstance(hit, dict) else ""
            if text:
                mapping.setdefault(text, set()).add(source)
    return mapping


def ordered_point_categories(categories: set[str]) -> list[str]:
    """Return the members of ``categories`` that are known, in ``POINT_CATEGORIES`` order."""
    ordered: list[str] = []
    for name in POINT_CATEGORIES:
        if name in categories:
            ordered.append(name)
    return ordered


def extract_point_matched_demand_names(
    point_item: dict[str, Any],
    word_lookup: dict[str, dict[str, Any]],
) -> str:
    """Join the demand names matched by all words that hit a point.

    Args:
        point_item: Point dict whose ``"匹配词列表"`` entry lists hit dicts,
            each carrying a ``"词"`` text.
        word_lookup: Word text to word row mapping used to resolve each hit.

    Returns:
        Distinct demand names in first-seen order joined by single spaces, or
        an empty string when nothing matches or the hit list is malformed.
    """
    hits = point_item.get("匹配词列表") or []
    if not isinstance(hits, list):
        return ""

    # Insertion-ordered dict keeps the first occurrence of every demand name.
    collected: dict[str, None] = {}
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        resolved_row = word_lookup.get(str(hit.get("词") or "").strip())
        if resolved_row:
            collected.update(dict.fromkeys(extract_matched_demand_name_list(resolved_row)))
    return " ".join(collected)


def _build_word_export_row(
    word_text: str,
    word_row: dict[str, Any],
    category: str,
) -> dict[str, Any]:
    """Build one element-type export row for a word under a point category.

    Args:
        word_text: The word being exported.
        word_row: Row holding the word's matched demands and its ``"贡献度"``.
        category: Point category the row belongs to (may be empty).

    Returns:
        Export row with item type, text, category, joined matched demand names
        and the contribution score (``None`` when not convertible).
    """
    demand_names = extract_matched_demand_names(word_row)
    score = _to_contribution_score(word_row.get("贡献度"))
    return {
        "item_type": ITEM_TYPE_ELEMENT,
        "item_text": word_text,
        "point_category": category,
        "matched_demand": demand_names,
        "contribution_score": score,
    }


def _resolve_word_row(
    word_text: str,
    *,
    word_lookup: dict[str, dict[str, Any]],
    match_result: dict[str, Any],
) -> dict[str, Any]:
    word_row = word_lookup.get(word_text)
    if isinstance(word_row, dict):
        return word_row

    for row in match_result.get("匹配到需求的词列表") or []:
        if isinstance(row, dict) and str(row.get("词") or "").strip() == word_text:
            return row
    for row in match_result.get("高贡献词列表") or []:
        if isinstance(row, dict) and str(row.get("词") or "").strip() == word_text:
            return row
    return {"词": word_text}


def append_wxindex_keyword_rows(
    export_rows: list[dict[str, Any]],
    *,
    trend_json: dict[str, Any] | None,
    match_result: dict[str, Any],
    word_lookup: dict[str, dict[str, Any]],
    word_to_categories: dict[str, set[str]],
) -> None:
    """Append element rows for the WeChat-index keyword if it is not exported yet.

    Nothing is appended when there is no keyword, when an element row for the
    keyword already exists, or when the keyword has no matched demand.
    ``export_rows`` is modified in place.

    Args:
        export_rows: Rows collected so far; new rows are appended to it.
        trend_json: Parsed trend payload the keyword is read from.
        match_result: Match payload used as a fallback source for the word row.
        word_lookup: Word text to word row mapping.
        word_to_categories: Word text to point categories mapping.
    """
    keyword = get_wxindex_keyword(trend_json)
    if not keyword:
        return

    for existing in export_rows:
        if existing.get("item_type") != ITEM_TYPE_ELEMENT:
            continue
        if str(existing.get("item_text") or "").strip() == keyword:
            return

    keyword_row = _resolve_word_row(keyword, word_lookup=word_lookup, match_result=match_result)
    if not extract_matched_demand_names(keyword_row):
        return

    categories = ordered_point_categories(word_to_categories.get(keyword, set()))
    # A keyword without any point category still gets one row with an empty category.
    for category in categories or [""]:
        export_rows.append(_build_word_export_row(keyword, keyword_row, category))


def build_demand_export_rows(
    match_result: dict[str, Any],
    *,
    contribution_points: dict[str, Any] | None = None,
    trend_json: dict[str, Any] | None = None,
) -> list[dict[str, Any]]:
    """Build the de-duplicated element and phrase export rows for one record.

    Element rows come from the ``"高贡献词列表"`` words that belong to at least
    one point category and have a matched demand (one row per category).
    Phrase rows come from points whose hit words have a matched demand. The
    WeChat-index keyword is added last if it is not already present.

    Args:
        match_result: Demand-match payload of the record.
        contribution_points: Optional payload used as the source of
            ``"点列表"`` when ``match_result`` has no usable list.
        trend_json: Optional trend payload providing the index keyword.

    Returns:
        Export rows, unique per (item type, item text, point category), in the
        order they were produced.
    """
    high_contribution_rows = match_result.get("高贡献词列表") or []
    if not isinstance(high_contribution_rows, list):
        high_contribution_rows = []

    point_items = match_result.get("点列表") or []
    if not (isinstance(point_items, list) and point_items):
        # Fall back to the contribution payload when it is a dict; otherwise
        # the lookup is repeated on match_result itself.
        fallback_source = (
            contribution_points if isinstance(contribution_points, dict) else match_result
        )
        point_items = fallback_source.get("点列表") or []
    if not isinstance(point_items, list):
        point_items = []

    word_lookup = build_word_lookup(high_contribution_rows)
    word_to_categories = build_word_to_categories(point_items)

    collected: list[dict[str, Any]] = []

    # Element rows: one per (word, category) for words with a matched demand.
    for text, row in word_lookup.items():
        word_categories = ordered_point_categories(word_to_categories.get(text, set()))
        if word_categories and extract_matched_demand_names(row):
            collected.extend(
                _build_word_export_row(text, row, category) for category in word_categories
            )

    # Phrase rows: one per point whose hit words have a matched demand.
    for item in point_items:
        if not isinstance(item, dict):
            continue
        phrase = str(item.get("点") or "").strip()
        source = str(item.get("来源") or "").strip()
        if not phrase or source not in POINT_CATEGORIES:
            continue
        phrase_demand = extract_point_matched_demand_names(item, word_lookup)
        if phrase_demand:
            collected.append(
                {
                    "item_type": ITEM_TYPE_PHRASE,
                    "item_text": phrase,
                    "point_category": source,
                    "matched_demand": phrase_demand,
                    "contribution_score": None,
                }
            )

    append_wxindex_keyword_rows(
        collected,
        trend_json=trend_json,
        match_result=match_result,
        word_lookup=word_lookup,
        word_to_categories=word_to_categories,
    )

    # Keep the first row for every (type, text, category) key, preserving order.
    unique: dict[tuple[str, str, str], dict[str, Any]] = {}
    for row in collected:
        key = (
            row["item_type"],
            row["item_text"],
            str(row.get("point_category") or ""),
        )
        unique.setdefault(key, row)
    return list(unique.values())


def attach_wxindex_metadata(
    export_rows: list[dict[str, Any]],
    trend_json: dict[str, Any] | None,
) -> list[dict[str, Any]]:
    """Return copies of the export rows extended with WeChat-index fields.

    Every row receives ``wxindex_keyword``. ``wxindex_latest_score`` and
    ``wxindex_trend`` carry the record's values only when a latest score exists
    and the row has a matched demand; otherwise they are ``0.0`` and ``""``.

    Args:
        export_rows: Rows produced for one record; they are not modified.
        trend_json: Parsed trend payload of the record, or ``None``.

    Returns:
        New row dicts in the same order as ``export_rows``.
    """
    has_payload = isinstance(trend_json, dict)
    latest_score = get_latest_wxindex_score(trend_json) if has_payload else None
    trend = get_wxindex_trend(trend_json) if has_payload else ""
    wxindex_keyword = get_wxindex_keyword(trend_json)

    enriched: list[dict[str, Any]] = []
    for row in export_rows:
        has_demand = bool(str(row.get("matched_demand") or "").strip())
        applies = latest_score is not None and has_demand

        merged = dict(row)
        merged.update(
            {
                "wxindex_keyword": wxindex_keyword,
                "wxindex_latest_score": float(latest_score) if applies else 0.0,
                "wxindex_trend": trend if applies else "",
            }
        )
        enriched.append(merged)
    return enriched


def _json_loads(value: Any) -> Any:
    if value is None:
        return None
    if isinstance(value, (dict, list)):
        return value
    if isinstance(value, (bytes, bytearray)):
        value = value.decode("utf-8")
    if isinstance(value, str):
        return json.loads(value)
    return value


def fetch_export_candidate_records(cursor: Any, limit: int) -> list[dict[str, Any]]:
    """Fetch the records that carry a non-blank demand-match JSON.

    Selects from ``hot_content_records`` the rows whose
    ``contribution_demand_match_json`` is neither NULL nor blank, ordered by
    ascending ``id``.

    Args:
        cursor: DB-API style cursor providing ``execute`` and ``fetchall``.
        limit: Maximum number of rows; a value <= 0 disables the limit.

    Returns:
        The fetched rows as a list. The row type depends on the cursor;
        presumably dict rows, as the annotation suggests -- TODO confirm.
    """
    unlimited = limit <= 0
    limit_sql = "" if unlimited else "LIMIT %s"
    params: tuple[Any, ...] = () if unlimited else (limit,)
    # Only the fixed "LIMIT %s" fragment is interpolated; the limit value
    # itself is passed as a bound parameter.
    query = f"""
        SELECT
            id,
            source,
            title,
            article_title,
            contribution_points_json,
            contribution_demand_match_json,
            wxindex_trend_json
        FROM hot_content_records
        WHERE contribution_demand_match_json IS NOT NULL
          AND TRIM(CAST(contribution_demand_match_json AS CHAR)) <> ''
        ORDER BY id ASC
        {limit_sql}
        """
    cursor.execute(query, params)
    fetched = cursor.fetchall()
    return list(fetched)


def export_existing_records(
    repository: HotContentRepository,
    records: list[dict[str, Any]],
    *,
    dry_run: bool,
    verbose: bool,
) -> dict[str, int]:
    """Rebuild and store the demand export rows of already processed records.

    Args:
        repository: Repository whose ``replace_demand_export_rows`` receives the
            rows of every record (not called when ``dry_run`` is set).
        records: Database rows providing ``id``, ``source``, ``title``,
            ``article_title`` and the three JSON columns.
        dry_run: When true, only count and print; nothing is written.
        verbose: When true, print a line per exported record and per record
            skipped because of undecodable JSON.

    Returns:
        Counters: ``scanned`` (all records), ``exported_records`` and
        ``exported_rows`` (records/rows with at least one export row),
        ``no_export_rows`` (records yielding no rows), ``invalid_json``
        (records whose JSON columns could not be decoded) and ``skipped``
        (records whose match JSON is not a dict).
    """
    summary = {
        "scanned": 0,
        "exported_records": 0,
        "exported_rows": 0,
        "no_export_rows": 0,
        "invalid_json": 0,
        "skipped": 0,
    }

    for row in records:
        summary["scanned"] += 1
        record_id = int(row["id"])
        try:
            match_json = _json_loads(row.get("contribution_demand_match_json"))
            contribution_points = _json_loads(row.get("contribution_points_json"))
            trend_json = _json_loads(row.get("wxindex_trend_json"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # UnicodeDecodeError: byte columns that are not valid UTF-8 must
            # not abort the whole batch either.
            summary["invalid_json"] += 1
            if verbose:
                print(f"id={record_id}: JSON 解析失败，已跳过")
            continue

        if not isinstance(match_json, dict):
            summary["skipped"] += 1
            continue

        # Optional payloads are only usable when they decode to a dict.
        if not isinstance(contribution_points, dict):
            contribution_points = None
        if not isinstance(trend_json, dict):
            trend_json = None

        export_rows = attach_wxindex_metadata(
            build_demand_export_rows(
                match_json,
                contribution_points=contribution_points,
                trend_json=trend_json,
            ),
            trend_json,
        )

        if export_rows and (verbose or dry_run):
            matched_rows = sum(
                1 for item in export_rows if str(item.get("matched_demand") or "").strip()
            )
            print(
                f"id={record_id} rows={len(export_rows)} matched_rows={matched_rows} "
                f"title={str(row.get('title') or '')[:40]}"
            )

        if not dry_run:
            # Called even with an empty list; presumably this clears rows
            # stored by an earlier export -- confirm against the repository.
            repository.replace_demand_export_rows(
                record_id=record_id,
                source=str(row.get("source") or ""),
                hot_title=str(row.get("title") or ""),
                article_title=str(row.get("article_title") or ""),
                rows=export_rows,
            )

        if export_rows:
            summary["exported_records"] += 1
            summary["exported_rows"] += len(export_rows)
        else:
            summary["no_export_rows"] += 1

    return summary


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the export script.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the process
            arguments.

    Returns:
        Namespace with ``limit`` (int, default 0), ``dry_run`` (bool) and
        ``verbose`` (bool).
    """
    description = (
        "扫描已有 contribution_demand_match_json 记录，"
        "导出全部元素/短语到 hot_content_demand_exports；"
        "元素/短语按灵感点/目的点/关键点展开为多行，无点类型数据过滤；"
        "并补充获取微信指数的词、微信指数及趋势。"
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="最多处理多少条记录，默认 0 表示不限制。",
    )
    # Both remaining options are plain boolean switches.
    switches = (
        ("--dry-run", "只统计/打印，不写入数据库。"),
        ("--verbose", "打印每条成功导出的记录。"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> dict[str, int]:
    """Run the export: load candidates, process them and print the summary.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read the process
            arguments.

    Returns:
        The summary counters returned by ``export_existing_records``.
    """
    options = parse_args(argv)
    flow_config = load_flow_config()
    repository = HotContentRepository(flow_config.mysql)
    try:
        with repository.conn.cursor() as cursor:
            candidates = fetch_export_candidate_records(cursor, options.limit)
        summary = export_existing_records(
            repository,
            candidates,
            dry_run=options.dry_run,
            verbose=options.verbose,
        )
    finally:
        # The repository is closed whether or not the export succeeded.
        repository.close()

    if options.dry_run:
        action = "预览完成"
    else:
        action = "导出完成"
    print(f"{action}：{json.dumps(summary, ensure_ascii=False)}")
    return summary


# Script entry point: run the export with the options given on the command line.
if __name__ == "__main__":
    main()