1 неделя назад · 45355d56cf
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -158,6 +158,8 @@ class Settings:
 
															     wxindex_llm_max_tokens: int = 4000
														
 
															     wxindex_api_url: str = "http://crawapi.piaoquantv.com/crawler/wei_xin/wxindex"
														
 
															     wxindex_lookback_days: int = 7
														
 
															+    wxindex_words_cron_hour: int = 7
														
 
															+    wxindex_words_cron_minute: int = 30
														
 
															     demand_event_sense_threshold: float = 6.0
														
 
															     demand_senior_fit_threshold: float = 6.0
														
 
															     demand_quality_llm_model: str = "anthropic/claude-haiku-4-5"
														
@@ -340,6 +342,14 @@ class Settings:
 
															                 "WXINDEX_LOOKBACK_DAYS",
														
 
															                 defaults.wxindex_lookback_days,
														
 
															             ),
														
 
															+            wxindex_words_cron_hour=_env_int(
														
 
															+                "WXINDEX_WORDS_CRON_HOUR",
														
 
															+                defaults.wxindex_words_cron_hour,
														
 
															+            ),
														
 
															+            wxindex_words_cron_minute=_env_int(
														
 
															+                "WXINDEX_WORDS_CRON_MINUTE",
														
 
															+                defaults.wxindex_words_cron_minute,
														
 
															+            ),
														
 
															             demand_event_sense_threshold=_env_float(
														
 
															                 "DEMAND_EVENT_SENSE_THRESHOLD",
														
 
															                 defaults.demand_event_sense_threshold,
														
--- a/app/hot_content/config.py
+++ b/app/hot_content/config.py
@@ -254,6 +254,14 @@ def load_flow_config(interval_override: int | None = None) -> FlowConfig:
 
															             "WXINDEX_LOOKBACK_DAYS",
														
 
															             settings.wxindex_lookback_days,
														
 
															         ),
														
 
															+        wxindex_words_cron_hour=_get_env_int(
														
 
															+            "WXINDEX_WORDS_CRON_HOUR",
														
 
															+            settings.wxindex_words_cron_hour,
														
 
															+        ),
														
 
															+        wxindex_words_cron_minute=_get_env_int(
														
 
															+            "WXINDEX_WORDS_CRON_MINUTE",
														
 
															+            settings.wxindex_words_cron_minute,
														
 
															+        ),
														
 
															         demand_event_sense_threshold=_get_env_float(
														
 
															             "DEMAND_EVENT_SENSE_THRESHOLD",
														
 
															             settings.demand_event_sense_threshold,
														
--- a/app/hot_content/postprocess_service.py
+++ b/app/hot_content/postprocess_service.py
@@ -25,6 +25,11 @@ from app.hot_content.demand_export import (
 
															 from app.hot_content.demand_pool_writer import sync_hot_demands_to_hive
														
 
															 from app.hot_content.demand_quality import run_demand_quality_pipeline
														
 
															 from app.hot_content.wxindex_trend import calc_wxindex_trend
														
 
															+from app.hot_content.wxindex_words import (
														
 
															+    ensure_word_full_scores,
														
 
															+    slice_scores_lookback,
														
 
															+    sync_words_from_trend_json,
														
 
															+)
														
 
															 class WxindexSelectionSkipped(Exception):
														
@@ -230,6 +235,10 @@ class ContributionPostprocessService:
 
															                     record_id=record_id,
														
 
															                     trend_json=trend_result,
														
 
															                 )
														
 
															+                self.sync_wxindex_words(
														
 
															+                    record_id=record_id,
														
 
															+                    trend_result=trend_result,
														
 
															+                )
														
 
															                 event_sense_json, senior_fit_json = self.run_demand_quality_judgment(
														
 
															                     record=record,
														
 
															                     match_result=match_result,
														
@@ -296,6 +305,22 @@ class ContributionPostprocessService:
 
															             result["hive_sync_error"] = str(exc)
														
 
															         return result
														
 
															+    def sync_wxindex_words(
														
 
															+        self,
														
 
															+        *,
														
 
															+        record_id: int,
														
 
															+        trend_result: dict[str, Any],
														
 
															+        verbose: bool = False,
														
 
															+    ) -> dict[str, int]:
														
 
															+        return sync_words_from_trend_json(
														
 
															+            self.repository,
														
 
															+            self.api_client,
														
 
															+            self.config.wxindex_api_url,
														
 
															+            trend_json=trend_result,
														
 
															+            record_id=record_id,
														
 
															+            verbose=verbose,
														
 
															+        )
														
 
															+
														
 
															     def _save_empty_demand_quality(self, *, record_id: int) -> None:
														
 
															         self.repository.save_demand_quality(
														
 
															             record_id=record_id,
														
@@ -617,18 +642,20 @@ class ContributionPostprocessService:
 
															             matched_demands=matched_demands,
														
 
															         )
														
 
															         selected_words = pick["selected_words"]
														
 
															-        start_ymd, end_ymd = _get_recent_range(self.config.wxindex_lookback_days)
														
 
															         threshold = float(self.config.wxindex_score_threshold)
														
 
															         wxindex_searches: list[dict[str, Any]] = []
														
 
															         for keyword in selected_words:
														
 
															-            wx_payload = {
														
 
															-                "keyword": keyword,
														
 
															-                "start_ymd": start_ymd,
														
 
															-                "end_ymd": end_ymd,
														
 
															-            }
														
 
															-            wx_resp = self.api_client.post_json(self.config.wxindex_api_url, wx_payload)
														
 
															-            series = _parse_total_scores(wx_resp)
														
 
															+            full_scores, _action = ensure_word_full_scores(
														
 
															+                self.repository,
														
 
															+                self.api_client,
														
 
															+                self.config.wxindex_api_url,
														
 
															+                keyword=keyword,
														
 
															+            )
														
 
															+            series, start_ymd, end_ymd = slice_scores_lookback(
														
 
															+                full_scores,
														
 
															+                self.config.wxindex_lookback_days,
														
 
															+            )
														
 
															             latest_score = series[-1]["total_score"] if series else None
														
 
															             wxindex_searches.append(
														
 
															                 {
														
@@ -672,8 +699,8 @@ class ContributionPostprocessService:
 
															             "wxindex": {
														
 
															                 "keyword": selected_word,
														
 
															                 "keywords": selected_words,
														
 
															-                "start_ymd": start_ymd,
														
 
															-                "end_ymd": end_ymd,
														
 
															+                "start_ymd": best["start_ymd"],
														
 
															+                "end_ymd": best["end_ymd"],
														
 
															                 "total_score_7d": series,
														
 
															                 "latest_total_score": latest_score,
														
 
															                 "threshold": threshold,
														
--- a/app/hot_content/repository.py
+++ b/app/hot_content/repository.py
@@ -1082,6 +1082,215 @@ class HotContentRepository:
 
															                 """
														
 
															             )
														
 
															+    def list_wxindex_word_scores(self, name: str) -> list[dict[str, Any]]:
														
 
															+        word = str(name or "").strip()
														
 
															+        if not word:
														
 
															+            return []
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        sql = """
														
 
															+            SELECT dt, total_score
														
 
															+            FROM hot_content_wxindex_words
														
 
															+            WHERE name = %s
														
 
															+            ORDER BY dt ASC
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, (word,))
														
 
															+            rows = cursor.fetchall()
														
 
															+        scores: list[dict[str, Any]] = []
														
 
															+        for row in rows:
														
 
															+            dt = str(row.get("dt") or "").strip()
														
 
															+            if not dt:
														
 
															+                continue
														
 
															+            try:
														
 
															+                total_score = float(row["total_score"])
														
 
															+            except (TypeError, ValueError, KeyError):
														
 
															+                continue
														
 
															+            scores.append({"ymd": dt, "total_score": total_score})
														
 
															+        return scores
														
 
															+
														
 
															+    def list_stale_wxindex_words(self, *, end_ymd: str) -> list[dict[str, Any]]:
														
 
															+        """返回已存在但最新日期早于 end_ymd 的词。"""
														
 
															+        target_end = str(end_ymd or "").strip()
														
 
															+        if not target_end:
														
 
															+            return []
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        sql = """
														
 
															+            SELECT name, MAX(dt) AS latest_dt
														
 
															+            FROM hot_content_wxindex_words
														
 
															+            GROUP BY name
														
 
															+            HAVING MAX(dt) < %s
														
 
															+            ORDER BY name ASC
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, (target_end,))
														
 
															+            rows = cursor.fetchall()
														
 
															+        stale_words: list[dict[str, Any]] = []
														
 
															+        for row in rows:
														
 
															+            name = str(row.get("name") or "").strip()
														
 
															+            latest_dt = str(row.get("latest_dt") or "").strip()
														
 
															+            if name and latest_dt:
														
 
															+                stale_words.append({"name": name, "latest_dt": latest_dt})
														
 
															+        return stale_words
														
 
															+
														
 
															+    def list_low_avg_wxindex_words(
														
 
															+        self,
														
 
															+        *,
														
 
															+        min_avg_score: float,
														
 
															+    ) -> list[dict[str, Any]]:
														
 
															+        """按 name 聚合，返回平均分低于阈值的词。"""
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        sql = """
														
 
															+            SELECT
														
 
															+                name,
														
 
															+                AVG(total_score) AS avg_score,
														
 
															+                COUNT(*) AS row_count
														
 
															+            FROM hot_content_wxindex_words
														
 
															+            GROUP BY name
														
 
															+            HAVING AVG(total_score) < %s
														
 
															+            ORDER BY name ASC
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, (min_avg_score,))
														
 
															+            rows = cursor.fetchall()
														
 
															+
														
 
															+        low_words: list[dict[str, Any]] = []
														
 
															+        for row in rows:
														
 
															+            name = str(row.get("name") or "").strip()
														
 
															+            if not name:
														
 
															+                continue
														
 
															+            try:
														
 
															+                avg_score = float(row["avg_score"])
														
 
															+                row_count = int(row["row_count"])
														
 
															+            except (TypeError, ValueError, KeyError):
														
 
															+                continue
														
 
															+            low_words.append(
														
 
															+                {
														
 
															+                    "name": name,
														
 
															+                    "avg_score": avg_score,
														
 
															+                    "row_count": row_count,
														
 
															+                }
														
 
															+            )
														
 
															+        return low_words
														
 
															+
														
 
															+    def delete_wxindex_words_by_names(self, names: list[str]) -> int:
														
 
															+        cleaned = [str(name or "").strip() for name in names if str(name or "").strip()]
														
 
															+        if not cleaned:
														
 
															+            return 0
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        placeholders = ", ".join(["%s"] * len(cleaned))
														
 
															+        sql = f"""
														
 
															+            DELETE FROM hot_content_wxindex_words
														
 
															+            WHERE name IN ({placeholders})
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, tuple(cleaned))
														
 
															+            return int(cursor.rowcount or 0)
														
 
															+
														
 
															+    def get_wxindex_word_latest_dt(self, name: str) -> str | None:
														
 
															+        word = str(name or "").strip()
														
 
															+        if not word:
														
 
															+            return None
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        sql = """
														
 
															+            SELECT MAX(dt) AS latest_dt
														
 
															+            FROM hot_content_wxindex_words
														
 
															+            WHERE name = %s
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, (word,))
														
 
															+            row = cursor.fetchone() or {}
														
 
															+        latest_dt = str(row.get("latest_dt") or "").strip()
														
 
															+        return latest_dt or None
														
 
															+
														
 
															+    def save_wxindex_daily_scores(
														
 
															+        self,
														
 
															+        *,
														
 
															+        name: str,
														
 
															+        scores: list[dict[str, Any]],
														
 
															+    ) -> tuple[int, int]:
														
 
															+        """按词+日期写入每日指数，重复行跳过。返回 (inserted, skipped)。"""
														
 
															+        word = str(name or "").strip()
														
 
															+        if not word or not scores:
														
 
															+            return 0, 0
														
 
															+        self._ensure_wxindex_words_table()
														
 
															+        sql = """
														
 
															+            INSERT IGNORE INTO hot_content_wxindex_words (
														
 
															+                name,
														
 
															+                dt,
														
 
															+                total_score
														
 
															+            )
														
 
															+            VALUES (%s, %s, %s)
														
 
															+        """
														
 
															+        rows: list[tuple[str, str, float]] = []
														
 
															+        seen: set[tuple[str, str]] = set()
														
 
															+        for item in scores:
														
 
															+            if not isinstance(item, dict):
														
 
															+                continue
														
 
															+            dt = str(item.get("ymd") or item.get("dt") or "").strip()
														
 
															+            if not dt:
														
 
															+                continue
														
 
															+            try:
														
 
															+                total_score = float(item["total_score"])
														
 
															+            except (TypeError, ValueError, KeyError):
														
 
															+                continue
														
 
															+            key = (word, dt)
														
 
															+            if key in seen:
														
 
															+                continue
														
 
															+            seen.add(key)
														
 
															+            rows.append((word, dt, total_score))
														
 
															+        if not rows:
														
 
															+            return 0, 0
														
 
															+
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.executemany(sql, rows)
														
 
															+            inserted = int(cursor.rowcount or 0)
														
 
															+        skipped = len(rows) - inserted
														
 
															+        return inserted, skipped
														
 
															+
														
 
															+    def list_records_with_wxindex_trend(
														
 
															+        self,
														
 
															+        *,
														
 
															+        since_dt: datetime,
														
 
															+    ) -> list[dict[str, Any]]:
														
 
															+        sql = """
														
 
															+            SELECT id, wxindex_trend_json
														
 
															+            FROM hot_content_records
														
 
															+            WHERE created_at >= %s
														
 
															+              AND wxindex_trend_json IS NOT NULL
														
 
															+              AND TRIM(CAST(wxindex_trend_json AS CHAR)) <> ''
														
 
															+            ORDER BY id ASC
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql, (since_dt,))
														
 
															+            rows = cursor.fetchall()
														
 
															+
														
 
															+        records: list[dict[str, Any]] = []
														
 
															+        for row in rows:
														
 
															+            records.append(
														
 
															+                {
														
 
															+                    "id": int(row["id"]),
														
 
															+                    "wxindex_trend_json": _json_loads(row.get("wxindex_trend_json")),
														
 
															+                }
														
 
															+            )
														
 
															+        return records
														
 
															+
														
 
															+    def _ensure_wxindex_words_table(self) -> None:
														
 
															+        sql = """
														
 
															+            CREATE TABLE IF NOT EXISTS hot_content_wxindex_words (
														
 
															+                id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
														
 
															+                name VARCHAR(256) NOT NULL COMMENT '词',
														
 
															+                dt VARCHAR(8) NOT NULL COMMENT '日期 yyyymmdd',
														
 
															+                total_score DOUBLE NOT NULL COMMENT '微信指数',
														
 
															+                created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
														
 
															+                PRIMARY KEY (id),
														
 
															+                UNIQUE KEY uk_name_dt (name, dt),
														
 
															+                KEY idx_name (name),
														
 
															+                KEY idx_dt (dt)
														
 
															+            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
														
 
															+        """
														
 
															+        with self.conn.cursor() as cursor:
														
 
															+            cursor.execute(sql)
														
 
															+
														
 
															     def _ensure_odps_sync_log_table(self) -> None:
														
 
															         sql = """
														
 
															             CREATE TABLE IF NOT EXISTS hot_content_odps_sync_log (
														
--- a/app/hot_content/types.py
+++ b/app/hot_content/types.py
@@ -53,6 +53,8 @@ class FlowConfig:
 
															     wxindex_llm_max_tokens: int
														
 
															     wxindex_api_url: str
														
 
															     wxindex_lookback_days: int
														
 
															+    wxindex_words_cron_hour: int
														
 
															+    wxindex_words_cron_minute: int
														
 
															     demand_event_sense_threshold: float
														
 
															     demand_senior_fit_threshold: float
														
 
															     demand_quality_llm_model: str
														
--- a/app/hot_content/wxindex_words.py
+++ b/app/hot_content/wxindex_words.py
@@ -0,0 +1,434 @@
 
															+"""微信指数检索词汇总：从 wxindex_trend_json 提取全部检索词并持久化每日指数。"""
														
 
															+
														
 
															+from __future__ import annotations
														
 
															+
														
 
															+from datetime import date, datetime, timedelta
														
 
															+from typing import Any
														
 
															+
														
 
															+from app.hot_content.client import JsonApiClient
														
 
															+from app.hot_content.demand_export import get_wxindex_keywords
														
 
															+from app.hot_content.repository import HotContentRepository
														
 
															+from app.hot_content.timezone import SHANGHAI_TZ
														
 
															+
														
 
															+WXINDEX_WORDS_START_YMD = "20260601"
														
 
															+WXINDEX_WORDS_RECORD_SINCE = date(2026, 6, 11)
														
 
															+WXINDEX_WORDS_MIN_AVG_SCORE = 100_000.0
														
 
															+
														
 
															+
														
 
															+def get_wxindex_end_ymd(*, today: date | None = None) -> str:
														
 
															+    current = today or datetime.now(SHANGHAI_TZ).date()
														
 
															+    return (current - timedelta(days=1)).strftime("%Y%m%d")
														
 
															+
														
 
															+
														
 
															+def get_lookback_range(lookback_days: int, *, today: date | None = None) -> tuple[str, str]:
														
 
															+    """原流程使用的近 N 日区间（截至昨日）。"""
														
 
															+    current = today or datetime.now(SHANGHAI_TZ).date()
														
 
															+    end_date = current - timedelta(days=1)
														
 
															+    start_date = end_date - timedelta(days=max(lookback_days, 1))
														
 
															+    return start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")
														
 
															+
														
 
															+
														
 
															+def extract_searched_words(trend_json: dict[str, Any] | None) -> list[str]:
														
 
															+    """提取 wxindex_trend_json 中实际检索过微信指数的全部词（非仅最高分词）。"""
														
 
															+    if not isinstance(trend_json, dict):
														
 
															+        return []
														
 
															+
														
 
															+    words: list[str] = []
														
 
															+    seen: set[str] = set()
														
 
															+    for item in trend_json.get("wxindex_searches") or []:
														
 
															+        if not isinstance(item, dict):
														
 
															+            continue
														
 
															+        keyword = str(item.get("keyword") or "").strip()
														
 
															+        if keyword and keyword not in seen:
														
 
															+            seen.add(keyword)
														
 
															+            words.append(keyword)
														
 
															+
														
 
															+    if words:
														
 
															+        return words
														
 
															+    return get_wxindex_keywords(trend_json)
														
 
															+
														
 
															+
														
 
															+def parse_wxindex_total_scores(wx_resp: dict[str, Any]) -> list[dict[str, Any]]:
														
 
															+    rows = ((wx_resp.get("data") or {}).get("data") or [])
														
 
															+    if not isinstance(rows, list):
														
 
															+        return []
														
 
															+
														
 
															+    series: list[dict[str, Any]] = []
														
 
															+    for row in rows:
														
 
															+        if not isinstance(row, dict):
														
 
															+            continue
														
 
															+        ymd = str(row.get("ymd") or "").strip()
														
 
															+        total_score = (row.get("channel_score") or {}).get("total_score")
														
 
															+        try:
														
 
															+            score_num = float(total_score) if total_score is not None else None
														
 
															+        except (TypeError, ValueError):
														
 
															+            score_num = None
														
 
															+        if ymd and score_num is not None:
														
 
															+            series.append({"ymd": ymd, "total_score": score_num})
														
 
															+    series.sort(key=lambda item: item["ymd"])
														
 
															+    return series
														
 
															+
														
 
															+
														
 
															+def fetch_wxindex_scores(
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    keyword: str,
														
 
															+    start_ymd: str = WXINDEX_WORDS_START_YMD,
														
 
															+    end_ymd: str | None = None,
														
 
															+) -> list[dict[str, Any]]:
														
 
															+    payload = {
														
 
															+        "keyword": keyword,
														
 
															+        "start_ymd": start_ymd,
														
 
															+        "end_ymd": end_ymd or get_wxindex_end_ymd(),
														
 
															+    }
														
 
															+    wx_resp = api_client.post_json(api_url, payload)
														
 
															+    return parse_wxindex_total_scores(wx_resp)
														
 
															+
														
 
															+
														
 
															+def scores_need_refresh(
														
 
															+    scores: list[dict[str, Any]],
														
 
															+    *,
														
 
															+    end_ymd: str | None = None,
														
 
															+) -> bool:
														
 
															+    if not scores:
														
 
															+        return True
														
 
															+    target_end = end_ymd or get_wxindex_end_ymd()
														
 
															+    latest_ymd = max(
														
 
															+        str(item.get("ymd") or "")
														
 
															+        for item in scores
														
 
															+        if isinstance(item, dict) and str(item.get("ymd") or "").strip()
														
 
															+    )
														
 
															+    return latest_ymd < target_end
														
 
															+
														
 
															+
														
 
															+def slice_scores_lookback(
														
 
															+    full_scores: list[dict[str, Any]],
														
 
															+    lookback_days: int,
														
 
															+    *,
														
 
															+    today: date | None = None,
														
 
															+) -> tuple[list[dict[str, Any]], str, str]:
														
 
															+    """从全量序列截取原流程所需的近 N 日数据。"""
														
 
															+    start_ymd, end_ymd = get_lookback_range(lookback_days, today=today)
														
 
															+    series = [
														
 
															+        item
														
 
															+        for item in full_scores
														
 
															+        if isinstance(item, dict)
														
 
															+        and start_ymd <= str(item.get("ymd") or "") <= end_ymd
														
 
															+    ]
														
 
															+    series.sort(key=lambda item: str(item.get("ymd") or ""))
														
 
															+    if series:
														
 
															+        return series, start_ymd, end_ymd
														
 
															+    return [], start_ymd, end_ymd
														
 
															+
														
 
															+
														
 
															+def next_ymd(ymd: str) -> str:
														
 
															+    current = datetime.strptime(ymd, "%Y%m%d").date()
														
 
															+    return (current + timedelta(days=1)).strftime("%Y%m%d")
														
 
															+
														
 
															+
														
 
															+def refresh_stale_wxindex_words(
														
 
															+    repository: HotContentRepository,
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    end_ymd: str | None = None,
														
 
															+    dry_run: bool = False,
														
 
															+    verbose: bool = False,
														
 
															+) -> dict[str, int]:
														
 
															+    """补全已存在但缺少最新日期数据的词。"""
														
 
															+    target_end = end_ymd or get_wxindex_end_ymd()
														
 
															+    summary = {
														
 
															+        "target_end_ymd": target_end,
														
 
															+        "stale_words": 0,
														
 
															+        "refreshed": 0,
														
 
															+        "inserted_rows": 0,
														
 
															+        "skipped_rows": 0,
														
 
															+        "fetch_failed": 0,
														
 
															+        "no_new_range": 0,
														
 
															+    }
														
 
															+
														
 
															+    stale_words = repository.list_stale_wxindex_words(end_ymd=target_end)
														
 
															+    summary["stale_words"] = len(stale_words)
														
 
															+    if not stale_words:
														
 
															+        return summary
														
 
															+
														
 
															+    for item in stale_words:
														
 
															+        name = str(item.get("name") or "").strip()
														
 
															+        latest_dt = str(item.get("latest_dt") or "").strip()
														
 
															+        if not name or not latest_dt:
														
 
															+            continue
														
 
															+
														
 
															+        start_ymd = next_ymd(latest_dt)
														
 
															+        if start_ymd > target_end:
														
 
															+            summary["no_new_range"] += 1
														
 
															+            if verbose:
														
 
															+                print(f"skip up-to-date word={name} latest_dt={latest_dt}")
														
 
															+            continue
														
 
															+
														
 
															+        if dry_run:
														
 
															+            summary["refreshed"] += 1
														
 
															+            if verbose:
														
 
															+                print(f"[dry-run] would refresh word={name} {start_ymd}->{target_end}")
														
 
															+            continue
														
 
															+
														
 
															+        try:
														
 
															+            api_scores = fetch_wxindex_scores(
														
 
															+                api_client,
														
 
															+                api_url,
														
 
															+                keyword=name,
														
 
															+                start_ymd=start_ymd,
														
 
															+                end_ymd=target_end,
														
 
															+            )
														
 
															+            inserted, skipped = repository.save_wxindex_daily_scores(
														
 
															+                name=name,
														
 
															+                scores=api_scores,
														
 
															+            )
														
 
															+        except Exception as exc:
														
 
															+            summary["fetch_failed"] += 1
														
 
															+            if verbose:
														
 
															+                print(f"refresh failed word={name}: {exc}")
														
 
															+            continue
														
 
															+
														
 
															+        summary["refreshed"] += 1
														
 
															+        summary["inserted_rows"] += inserted
														
 
															+        summary["skipped_rows"] += skipped
														
 
															+        if verbose:
														
 
															+            print(
														
 
															+                f"refreshed word={name} range={start_ymd}->{target_end} "
														
 
															+                f"inserted={inserted} skipped={skipped}"
														
 
															+            )
														
 
															+
														
 
															+    return summary
														
 
															+
														
 
															+
														
 
															+def cleanup_low_avg_wxindex_words(
														
 
															+    repository: HotContentRepository,
														
 
															+    *,
														
 
															+    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
														
 
															+    dry_run: bool = False,
														
 
															+    verbose: bool = False,
														
 
															+) -> dict[str, int | float]:
														
 
															+    """删除各 dt 平均分低于阈值的词（按 name 整词删除）。"""
														
 
															+    summary: dict[str, int | float] = {
														
 
															+        "min_avg_score": min_avg_score,
														
 
															+        "low_avg_words": 0,
														
 
															+        "deleted_rows": 0,
														
 
															+    }
														
 
															+    low_words = repository.list_low_avg_wxindex_words(min_avg_score=min_avg_score)
														
 
															+    summary["low_avg_words"] = len(low_words)
														
 
															+    if not low_words:
														
 
															+        return summary
														
 
															+
														
 
															+    if dry_run:
														
 
															+        if verbose:
														
 
															+            for item in low_words:
														
 
															+                print(
														
 
															+                    f"[dry-run] would delete word={item['name']} "
														
 
															+                    f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
														
 
															+                )
														
 
															+        summary["deleted_rows"] = sum(int(item["row_count"]) for item in low_words)
														
 
															+        return summary
														
 
															+
														
 
															+    names = [str(item["name"]) for item in low_words if str(item.get("name") or "").strip()]
														
 
															+    deleted_rows = repository.delete_wxindex_words_by_names(names)
														
 
															+    summary["deleted_rows"] = deleted_rows
														
 
															+    if verbose:
														
 
															+        for item in low_words:
														
 
															+            print(
														
 
															+                f"deleted word={item['name']} "
														
 
															+                f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
														
 
															+            )
														
 
															+    return summary
														
 
															+
														
 
															+
														
 
															+def run_wxindex_words_daily_job(
														
 
															+    repository: HotContentRepository,
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    end_ymd: str | None = None,
														
 
															+    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
														
 
															+    dry_run: bool = False,
														
 
															+    verbose: bool = False,
														
 
															+) -> dict[str, Any]:
														
 
															+    """定时任务：先补全缺失日期，再清理低均值词。"""
														
 
															+    refresh_summary = refresh_stale_wxindex_words(
														
 
															+        repository,
														
 
															+        api_client,
														
 
															+        api_url,
														
 
															+        end_ymd=end_ymd,
														
 
															+        dry_run=dry_run,
														
 
															+        verbose=verbose,
														
 
															+    )
														
 
															+    cleanup_summary = cleanup_low_avg_wxindex_words(
														
 
															+        repository,
														
 
															+        min_avg_score=min_avg_score,
														
 
															+        dry_run=dry_run,
														
 
															+        verbose=verbose,
														
 
															+    )
														
 
															+    return {
														
 
															+        "refresh": refresh_summary,
														
 
															+        "cleanup": cleanup_summary,
														
 
															+    }
														
 
															+
														
 
															+
														
 
															+def ensure_word_full_scores(
														
 
															+    repository: HotContentRepository,
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    keyword: str,
														
 
															+    end_ymd: str | None = None,
														
 
															+    force_refresh: bool = False,
														
 
															+    dry_run: bool = False,
														
 
															+) -> tuple[list[dict[str, Any]], str]:
														
 
															+    """
														
 
															+    获取词的全量微信指数（20260601 起至昨日），按词+日期逐行入库。
														
 
															+
														
 
															+    返回 (scores, action)，action 为 inserted / updated / cached / dry_run。
														
 
															+    """
														
 
															+    word = str(keyword or "").strip()
														
 
															+    if not word:
														
 
															+        return [], "empty"
														
 
															+
														
 
															+    target_end = end_ymd or get_wxindex_end_ymd()
														
 
															+    stored_scores = repository.list_wxindex_word_scores(word)
														
 
															+    if stored_scores and not force_refresh and not scores_need_refresh(
														
 
															+        stored_scores,
														
 
															+        end_ymd=target_end,
														
 
															+    ):
														
 
															+        return stored_scores, "cached"
														
 
															+
														
 
															+    if dry_run:
														
 
															+        return [], "dry_run"
														
 
															+
														
 
															+    had_data = bool(stored_scores)
														
 
															+    api_scores = fetch_wxindex_scores(
														
 
															+        api_client,
														
 
															+        api_url,
														
 
															+        keyword=word,
														
 
															+        end_ymd=target_end,
														
 
															+    )
														
 
															+    inserted, _skipped = repository.save_wxindex_daily_scores(
														
 
															+        name=word,
														
 
															+        scores=api_scores,
														
 
															+    )
														
 
															+    final_scores = repository.list_wxindex_word_scores(word)
														
 
															+    if inserted > 0:
														
 
															+        action = "updated" if had_data else "inserted"
														
 
															+    else:
														
 
															+        action = "cached"
														
 
															+    return final_scores or api_scores, action
														
 
															+
														
 
															+
														
 
															+def sync_words_from_trend_json(
														
 
															+    repository: HotContentRepository,
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    trend_json: dict[str, Any],
														
 
															+    record_id: int,
														
 
															+    dry_run: bool = False,
														
 
															+    verbose: bool = False,
														
 
															+) -> dict[str, int]:
														
 
															+    """将单条记录的 wxindex_trend_json 中检索词写入/刷新汇总表（全量数据）。"""
														
 
															+    summary = {
														
 
															+        "words_found": 0,
														
 
															+        "inserted": 0,
														
 
															+        "updated": 0,
														
 
															+        "cached": 0,
														
 
															+        "fetch_failed": 0,
														
 
															+    }
														
 
															+    words = extract_searched_words(trend_json)
														
 
															+    summary["words_found"] = len(words)
														
 
															+    if not words:
														
 
															+        return summary
														
 
															+
														
 
															+    for name in words:
														
 
															+        try:
														
 
															+            _, action = ensure_word_full_scores(
														
 
															+                repository,
														
 
															+                api_client,
														
 
															+                api_url,
														
 
															+                keyword=name,
														
 
															+                dry_run=dry_run,
														
 
															+            )
														
 
															+        except Exception as exc:
														
 
															+            summary["fetch_failed"] += 1
														
 
															+            if verbose:
														
 
															+                print(f"  fetch failed word={name}: {exc}")
														
 
															+            continue
														
 
															+
														
 
															+        if action == "inserted":
														
 
															+            summary["inserted"] += 1
														
 
															+        elif action == "updated":
														
 
															+            summary["updated"] += 1
														
 
															+        elif action == "cached":
														
 
															+            summary["cached"] += 1
														
 
															+        elif action == "dry_run":
														
 
															+            summary["inserted"] += 1
														
 
															+
														
 
															+        if verbose:
														
 
															+            print(f"  word={name} action={action}")
														
 
															+
														
 
															+    return summary
														
 
															+
														
 
															+
														
 
															+def backfill_wxindex_words(
														
 
															+    repository: HotContentRepository,
														
 
															+    api_client: JsonApiClient,
														
 
															+    api_url: str,
														
 
															+    *,
														
 
															+    since_date: date = WXINDEX_WORDS_RECORD_SINCE,
														
 
															+    dry_run: bool = False,
														
 
															+    verbose: bool = False,
														
 
															+) -> dict[str, int]:
														
 
															+    """扫描 hot_content_records，汇总 6/11 起全部微信指数检索词（历史回填调 API）。"""
														
 
															+    summary = {
														
 
															+        "records_scanned": 0,
														
 
															+        "records_with_words": 0,
														
 
															+        "words_found": 0,
														
 
															+        "inserted": 0,
														
 
															+        "updated": 0,
														
 
															+        "cached": 0,
														
 
															+        "fetch_failed": 0,
														
 
															+        "invalid_json": 0,
														
 
															+    }
														
 
															+
														
 
															+    since_dt = datetime.combine(since_date, datetime.min.time()).replace(tzinfo=SHANGHAI_TZ)
														
 
															+    records = repository.list_records_with_wxindex_trend(since_dt=since_dt)
														
 
															+
														
 
															+    for row in records:
														
 
															+        summary["records_scanned"] += 1
														
 
															+        record_id = int(row["id"])
														
 
															+        try:
														
 
															+            trend_json = row.get("wxindex_trend_json")
														
 
															+            if not isinstance(trend_json, dict):
														
 
															+                summary["invalid_json"] += 1
														
 
															+                continue
														
 
															+        except (TypeError, ValueError):
														
 
															+            summary["invalid_json"] += 1
														
 
															+            continue
														
 
															+
														
 
															+        words = extract_searched_words(trend_json)
														
 
															+        if not words:
														
 
															+            continue
														
 
															+
														
 
															+        summary["records_with_words"] += 1
														
 
															+        if verbose:
														
 
															+            print(f"id={record_id} words={words}")
														
 
															+
														
 
															+        result = sync_words_from_trend_json(
														
 
															+            repository,
														
 
															+            api_client,
														
 
															+            api_url,
														
 
															+            trend_json=trend_json,
														
 
															+            record_id=record_id,
														
 
															+            dry_run=dry_run,
														
 
															+            verbose=verbose,
														
 
															+        )
														
 
															+        for key in ("words_found", "inserted", "updated", "cached", "fetch_failed"):
														
 
															+            summary[key] += result[key]
														
 
															+
														
 
															+    return summary
														
--- a/app/scheduler.py
+++ b/app/scheduler.py
@@ -13,12 +13,15 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
 
															 if str(PROJECT_ROOT) not in sys.path:
														
 
															     sys.path.insert(0, str(PROJECT_ROOT))
														
 
															+from app.hot_content.client import JsonApiClient
														
 
															 from app.hot_content.decode_result_service import run_once as run_decode_result_once
														
 
															 from app.hot_content.config import load_flow_config
														
 
															 from app.hot_content.postprocess_service import run_once as run_postprocess_once
														
 
															+from app.hot_content.repository import HotContentRepository
														
 
															 from app.hot_content.service import run_once
														
 
															 from app.hot_content.timezone import SHANGHAI_TZ
														
 
															 from app.hot_content.types import FlowConfig
														
 
															+from app.hot_content.wxindex_words import run_wxindex_words_daily_job
														
 
															 def _import_blocking_scheduler() -> Any:
														
@@ -61,6 +64,31 @@ def run_postprocess_job(config: FlowConfig) -> None:
 
															         print(f"postprocess flow failed: {exc}", file=sys.stderr)
														
 
															+def run_wxindex_words_refresh_job(config: FlowConfig) -> None:
														
 
															+    repository = HotContentRepository(config.mysql)
														
 
															+    api_client = JsonApiClient(
														
 
															+        timeout_seconds=config.request_timeout_seconds,
														
 
															+        verify_ssl=config.https_verify_ssl,
														
 
															+    )
														
 
															+    try:
														
 
															+        summary = run_wxindex_words_daily_job(
														
 
															+            repository,
														
 
															+            api_client,
														
 
															+            config.wxindex_api_url,
														
 
															+        )
														
 
															+        print(
														
 
															+            json.dumps(
														
 
															+                {"job": "wxindex_words_refresh", "summary": summary},
														
 
															+                ensure_ascii=False,
														
 
															+                indent=2,
														
 
															+            )
														
 
															+        )
														
 
															+    except Exception as exc:
														
 
															+        print(f"wxindex words refresh failed: {exc}", file=sys.stderr)
														
 
															+    finally:
														
 
															+        repository.close()
														
 
															+
														
 
															+
														
 
															 def register_hot_content_job(scheduler: Any, config: FlowConfig) -> None:
														
 
															     scheduler.add_job(
														
 
															         run_hot_content_job,
														
@@ -93,17 +121,35 @@ def register_decode_result_job(scheduler: Any, config: FlowConfig) -> None:
 
															     )
														
 
															+def register_wxindex_words_refresh_job(scheduler: Any, config: FlowConfig) -> None:
														
 
															+    scheduler.add_job(
														
 
															+        run_wxindex_words_refresh_job,
														
 
															+        trigger="cron",
														
 
															+        hour=config.wxindex_words_cron_hour,
														
 
															+        minute=config.wxindex_words_cron_minute,
														
 
															+        timezone=SHANGHAI_TZ,
														
 
															+        args=[config],
														
 
															+        id="wxindex_words_refresh",
														
 
															+        name="微信指数词汇总表补全缺失日期并清理低均值词",
														
 
															+        replace_existing=True,
														
 
															+        coalesce=True,
														
 
															+        max_instances=1,
														
 
															+    )
														
 
															+
														
 
															+
														
 
															 def start_scheduler() -> None:
														
 
															     BlockingScheduler = _import_blocking_scheduler()
														
 
															     scheduler = BlockingScheduler(timezone=SHANGHAI_TZ)
														
 
															     config = load_flow_config()
														
 
															     register_hot_content_job(scheduler, config)
														
 
															     register_decode_result_job(scheduler, config)
														
 
															+    register_wxindex_words_refresh_job(scheduler, config)
														
 
															     print(
														
 
															         "scheduler started, timezone=Asia/Shanghai, "
														
 
															-        "jobs=['hot_content_flow', 'decode_result_flow'], "
														
 
															+        "jobs=['hot_content_flow', 'decode_result_flow', 'wxindex_words_refresh'], "
														
 
															         f"hot_cron={config.hot_flow_cron_hours}:{config.hot_flow_cron_minute:02d}, "
														
 
															-        f"decode_result_interval={config.decode_result_interval_seconds}s"
														
 
															+        f"decode_result_interval={config.decode_result_interval_seconds}s, "
														
 
															+        f"wxindex_words_cron={config.wxindex_words_cron_hour}:{config.wxindex_words_cron_minute:02d}"
														
 
															     )
														
 
															     scheduler.start()
														
@@ -113,7 +159,7 @@ def parse_args() -> argparse.Namespace:
 
															     parser.add_argument("--once", action="store_true", help="执行一次，不启动调度器")
														
 
															     parser.add_argument(
														
 
															         "--job",
														
 
															-        choices=("all", "hot-content", "decode-result", "postprocess"),
														
 
															+        choices=("all", "hot-content", "decode-result", "postprocess", "wxindex-refresh"),
														
 
															         default="all",
														
 
															         help="--once 时选择执行哪个任务",
														
 
															     )
														
@@ -159,6 +205,8 @@ def main() -> None:
 
															                     indent=2,
														
 
															                 )
														
 
															             )
														
 
															+        if args.job in {"wxindex-refresh"}:
														
 
															+            run_wxindex_words_refresh_job(config)
														
 
															         return
														
 
															     start_scheduler()