|
@@ -0,0 +1,434 @@
|
|
|
|
|
+"""微信指数检索词汇总:从 wxindex_trend_json 提取全部检索词并持久化每日指数。"""
|
|
|
|
|
+
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+from datetime import date, datetime, timedelta
|
|
|
|
|
+from typing import Any
|
|
|
|
|
+
|
|
|
|
|
+from app.hot_content.client import JsonApiClient
|
|
|
|
|
+from app.hot_content.demand_export import get_wxindex_keywords
|
|
|
|
|
+from app.hot_content.repository import HotContentRepository
|
|
|
|
|
+from app.hot_content.timezone import SHANGHAI_TZ
|
|
|
|
|
+
|
|
|
|
|
# Default first day (YYYYMMDD) requested when fetching a word's score history.
WXINDEX_WORDS_START_YMD = "20260601"
# Default earliest record date scanned by the historical backfill.
WXINDEX_WORDS_RECORD_SINCE = date(2026, 6, 11)
# Default average-score threshold handed to the low-average cleanup.
WXINDEX_WORDS_MIN_AVG_SCORE = 100_000.0
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_wxindex_end_ymd(*, today: date | None = None) -> str:
    """Return yesterday's date as a ``YYYYMMDD`` string.

    "Yesterday" is relative to ``today`` when it is given, otherwise to the
    current date in the Shanghai timezone.
    """
    if today:
        reference_day = today
    else:
        reference_day = datetime.now(SHANGHAI_TZ).date()
    yesterday = reference_day - timedelta(days=1)
    return f"{yesterday:%Y%m%d}"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_lookback_range(lookback_days: int, *, today: date | None = None) -> tuple[str, str]:
    """Return the ``(start_ymd, end_ymd)`` window used by the original flow.

    The window ends yesterday (relative to ``today``, or to the current
    Shanghai date when omitted) and starts ``lookback_days`` days before that
    end date. Values below 1 are treated as 1.
    """
    reference_day = today if today else datetime.now(SHANGHAI_TZ).date()
    window_end = reference_day - timedelta(days=1)
    span = timedelta(days=max(lookback_days, 1))
    window_start = window_end - span
    return f"{window_start:%Y%m%d}", f"{window_end:%Y%m%d}"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_searched_words(trend_json: dict[str, Any] | None) -> list[str]:
    """Collect every keyword that was actually searched on the WeChat index.

    Keywords are read from the ``wxindex_searches`` entries of ``trend_json``
    (all of them, not only the top-scoring one), stripped, de-duplicated and
    kept in first-seen order. When no keyword is found there, the result of
    ``get_wxindex_keywords(trend_json)`` is returned instead. Non-dict input
    yields an empty list.
    """
    if not isinstance(trend_json, dict):
        return []

    searches = trend_json.get("wxindex_searches") or []
    stripped_keywords = (
        str(entry.get("keyword") or "").strip()
        for entry in searches
        if isinstance(entry, dict)
    )
    # dict.fromkeys drops duplicates while preserving first-seen order.
    unique_words = [word for word in dict.fromkeys(stripped_keywords) if word]
    return unique_words or get_wxindex_keywords(trend_json)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def parse_wxindex_total_scores(wx_resp: dict[str, Any]) -> list[dict[str, Any]]:
    """Extract the daily total scores from a WeChat index API response.

    Rows are read from ``wx_resp["data"]["data"]``. A row is kept when it has
    a non-blank ``ymd`` and a ``channel_score.total_score`` that converts to
    ``float``.

    Malformed input never raises: a non-dict response or envelope, a non-list
    row container, non-dict rows, and rows without a usable date or score are
    all skipped.

    Returns:
        A list of ``{"ymd": str, "total_score": float}`` dicts sorted by
        ``ymd`` ascending.
    """
    # Guard each level with isinstance so an error payload such as
    # {"data": "error"} yields [] instead of an AttributeError on ``.get``.
    envelope = wx_resp.get("data") if isinstance(wx_resp, dict) else None
    rows = envelope.get("data") if isinstance(envelope, dict) else None
    if not isinstance(rows, list):
        return []

    series: list[dict[str, Any]] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        ymd = str(row.get("ymd") or "").strip()
        channel_score = row.get("channel_score")
        total_score = (
            channel_score.get("total_score")
            if isinstance(channel_score, dict)
            else None
        )
        if not ymd or total_score is None:
            continue
        try:
            score_num = float(total_score)
        except (TypeError, ValueError):
            # Non-numeric score: drop the row rather than fail the whole parse.
            continue
        series.append({"ymd": ymd, "total_score": score_num})

    series.sort(key=lambda item: item["ymd"])
    return series
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def fetch_wxindex_scores(
    api_client: JsonApiClient,
    api_url: str,
    *,
    keyword: str,
    start_ymd: str = WXINDEX_WORDS_START_YMD,
    end_ymd: str | None = None,
) -> list[dict[str, Any]]:
    """Request the WeChat index of ``keyword`` and return its parsed scores.

    Posts ``keyword``, ``start_ymd`` and ``end_ymd`` to ``api_url`` through
    ``api_client.post_json``; ``end_ymd`` falls back to
    ``get_wxindex_end_ymd()`` when omitted. The response is converted with
    ``parse_wxindex_total_scores``.
    """
    resolved_end = end_ymd if end_ymd else get_wxindex_end_ymd()
    request_body = dict(keyword=keyword, start_ymd=start_ymd, end_ymd=resolved_end)
    return parse_wxindex_total_scores(api_client.post_json(api_url, request_body))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def scores_need_refresh(
    scores: list[dict[str, Any]],
    *,
    end_ymd: str | None = None,
) -> bool:
    """Tell whether stored ``scores`` stop short of the target end day.

    Args:
        scores: Stored score rows; only dict items with a non-blank ``ymd``
            are considered.
        end_ymd: Target end day (``YYYYMMDD``); defaults to
            ``get_wxindex_end_ymd()``.

    Returns:
        ``True`` when ``scores`` is empty, contains no usable ``ymd``, or its
        latest ``ymd`` is earlier than the target end day; ``False`` otherwise.
    """
    if not scores:
        return True

    target_end = end_ymd or get_wxindex_end_ymd()
    # Compare the stripped value (the same one used for filtering) so padded
    # dates are not mis-ordered by their leading whitespace.
    known_days = (
        str(item.get("ymd") or "").strip()
        for item in scores
        if isinstance(item, dict)
    )
    # default="" keeps max() from raising when no row carries a date; an
    # empty string sorts before any real day, i.e. "needs refresh".
    latest_ymd = max((day for day in known_days if day), default="")
    return latest_ymd < target_end
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def slice_scores_lookback(
    full_scores: list[dict[str, Any]],
    lookback_days: int,
    *,
    today: date | None = None,
) -> tuple[list[dict[str, Any]], str, str]:
    """Cut the recent-N-days window the original flow needs out of a full series.

    The window bounds come from ``get_lookback_range(lookback_days, today=today)``.
    Dict items whose ``ymd`` lies inside the bounds (inclusive) are returned
    sorted by ``ymd``, together with the ``start_ymd`` and ``end_ymd`` bounds.
    """
    start_ymd, end_ymd = get_lookback_range(lookback_days, today=today)

    def _day_of(entry: dict[str, Any]) -> str:
        return str(entry.get("ymd") or "")

    in_window = sorted(
        (
            entry
            for entry in full_scores
            if isinstance(entry, dict) and start_ymd <= _day_of(entry) <= end_ymd
        ),
        key=_day_of,
    )
    return in_window, start_ymd, end_ymd
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def next_ymd(ymd: str) -> str:
    """Return the calendar day after ``ymd``; both are ``YYYYMMDD`` strings.

    Raises:
        ValueError: If ``ymd`` does not match the ``%Y%m%d`` format.
    """
    following_day = datetime.strptime(ymd, "%Y%m%d") + timedelta(days=1)
    return following_day.strftime("%Y%m%d")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def refresh_stale_wxindex_words(
    repository: HotContentRepository,
    api_client: JsonApiClient,
    api_url: str,
    *,
    end_ymd: str | None = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """Fill in the missing recent days for words that are already stored.

    For every word reported by ``repository.list_stale_wxindex_words`` the
    range from the day after its ``latest_dt`` up to the target end day is
    fetched and saved with ``repository.save_wxindex_daily_scores``.

    Args:
        repository: Storage used to list stale words and save new rows.
        api_client: Client used to call the WeChat index API.
        api_url: Endpoint of the WeChat index API.
        end_ymd: Target end day (``YYYYMMDD``); defaults to
            ``get_wxindex_end_ymd()``.
        dry_run: When set, nothing is fetched or saved; words that would be
            refreshed are still counted under ``refreshed``.
        verbose: Print one progress line per word.

    Returns:
        A summary dict with ``target_end_ymd`` (str) and the integer counters
        ``stale_words``, ``refreshed``, ``inserted_rows``, ``skipped_rows``,
        ``fetch_failed`` and ``no_new_range``.
    """
    target_end = end_ymd or get_wxindex_end_ymd()
    summary: dict[str, Any] = {
        "target_end_ymd": target_end,
        "stale_words": 0,
        "refreshed": 0,
        "inserted_rows": 0,
        "skipped_rows": 0,
        "fetch_failed": 0,
        "no_new_range": 0,
    }

    stale_words = repository.list_stale_wxindex_words(end_ymd=target_end)
    summary["stale_words"] = len(stale_words)

    for item in stale_words:
        name = str(item.get("name") or "").strip()
        latest_dt = str(item.get("latest_dt") or "").strip()
        if not name or not latest_dt:
            continue

        try:
            start_ymd = next_ymd(latest_dt)
        except ValueError:
            # A malformed stored date must not abort the refresh of the
            # remaining words; count it as a failure and move on.
            summary["fetch_failed"] += 1
            if verbose:
                print(f"refresh failed word={name}: invalid latest_dt={latest_dt}")
            continue

        if start_ymd > target_end:
            summary["no_new_range"] += 1
            if verbose:
                print(f"skip up-to-date word={name} latest_dt={latest_dt}")
            continue

        if dry_run:
            summary["refreshed"] += 1
            if verbose:
                print(f"[dry-run] would refresh word={name} {start_ymd}->{target_end}")
            continue

        try:
            api_scores = fetch_wxindex_scores(
                api_client,
                api_url,
                keyword=name,
                start_ymd=start_ymd,
                end_ymd=target_end,
            )
            inserted, skipped = repository.save_wxindex_daily_scores(
                name=name,
                scores=api_scores,
            )
        except Exception as exc:
            # Best effort per word: a failing fetch or save is counted and
            # the loop continues with the next word.
            summary["fetch_failed"] += 1
            if verbose:
                print(f"refresh failed word={name}: {exc}")
            continue

        summary["refreshed"] += 1
        summary["inserted_rows"] += inserted
        summary["skipped_rows"] += skipped
        if verbose:
            print(
                f"refreshed word={name} range={start_ymd}->{target_end} "
                f"inserted={inserted} skipped={skipped}"
            )

    return summary
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def cleanup_low_avg_wxindex_words(
    repository: HotContentRepository,
    *,
    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, int | float]:
    """Delete words whose average score across days is below the threshold.

    Words come from ``repository.list_low_avg_wxindex_words`` and are removed
    as a whole, by name, through ``repository.delete_wxindex_words_by_names``.
    With ``dry_run`` nothing is deleted and ``deleted_rows`` is the sum of the
    reported ``row_count`` values instead.

    Returns:
        A summary with ``min_avg_score``, ``low_avg_words`` and
        ``deleted_rows``.
    """
    summary: dict[str, int | float] = {
        "min_avg_score": min_avg_score,
        "low_avg_words": 0,
        "deleted_rows": 0,
    }
    low_words = repository.list_low_avg_wxindex_words(min_avg_score=min_avg_score)
    summary["low_avg_words"] = len(low_words)
    if not low_words:
        return summary

    if not dry_run:
        # Only entries with a non-blank name are sent for deletion.
        names = []
        for entry in low_words:
            if str(entry.get("name") or "").strip():
                names.append(str(entry["name"]))
        summary["deleted_rows"] = repository.delete_wxindex_words_by_names(names)
        if verbose:
            for item in low_words:
                print(
                    f"deleted word={item['name']} "
                    f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
                )
        return summary

    if verbose:
        for item in low_words:
            print(
                f"[dry-run] would delete word={item['name']} "
                f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
            )
    row_total = 0
    for entry in low_words:
        row_total += int(entry["row_count"])
    summary["deleted_rows"] = row_total
    return summary
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def run_wxindex_words_daily_job(
    repository: HotContentRepository,
    api_client: JsonApiClient,
    api_url: str,
    *,
    end_ymd: str | None = None,
    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """Scheduled job: fill in missing days first, then clean up low-average words.

    Returns:
        ``{"refresh": ..., "cleanup": ...}`` holding the summaries of
        ``refresh_stale_wxindex_words`` and ``cleanup_low_avg_wxindex_words``.
    """
    shared_flags = {"dry_run": dry_run, "verbose": verbose}

    # Order matters: the refresh runs before the cleanup.
    refresh_summary = refresh_stale_wxindex_words(
        repository,
        api_client,
        api_url,
        end_ymd=end_ymd,
        **shared_flags,
    )
    cleanup_summary = cleanup_low_avg_wxindex_words(
        repository,
        min_avg_score=min_avg_score,
        **shared_flags,
    )
    return dict(refresh=refresh_summary, cleanup=cleanup_summary)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def ensure_word_full_scores(
    repository: HotContentRepository,
    api_client: JsonApiClient,
    api_url: str,
    *,
    keyword: str,
    end_ymd: str | None = None,
    force_refresh: bool = False,
    dry_run: bool = False,
) -> tuple[list[dict[str, Any]], str]:
    """Make sure the full WeChat index history of ``keyword`` is stored.

    The history runs from the default start day of ``fetch_wxindex_scores``
    up to ``end_ymd`` (yesterday when omitted) and is saved through the
    repository, one row per word and day.

    Returns:
        ``(scores, action)`` where ``action`` is one of:

        * ``"empty"``: the keyword is blank; nothing is done.
        * ``"cached"``: stored data already reaches the target end day and no
          refresh is forced, or a fetch was made but inserted no rows.
        * ``"dry_run"``: a fetch would be needed but ``dry_run`` is set; the
          returned scores are ``[]``.
        * ``"inserted"`` / ``"updated"``: rows were inserted for a word that
          had no / some stored data before.
    """
    word = str(keyword or "").strip()
    if not word:
        return [], "empty"

    target_end = end_ymd or get_wxindex_end_ymd()
    stored_scores = repository.list_wxindex_word_scores(word)

    up_to_date = (
        bool(stored_scores)
        and not force_refresh
        and not scores_need_refresh(stored_scores, end_ymd=target_end)
    )
    if up_to_date:
        return stored_scores, "cached"
    if dry_run:
        return [], "dry_run"

    api_scores = fetch_wxindex_scores(
        api_client, api_url, keyword=word, end_ymd=target_end
    )
    inserted, _ = repository.save_wxindex_daily_scores(name=word, scores=api_scores)
    # Re-read so the caller gets the stored view; fall back to the API rows.
    final_scores = repository.list_wxindex_word_scores(word)

    if inserted > 0:
        action = "updated" if stored_scores else "inserted"
    else:
        action = "cached"
    return final_scores or api_scores, action
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def sync_words_from_trend_json(
    repository: HotContentRepository,
    api_client: JsonApiClient,
    api_url: str,
    *,
    trend_json: dict[str, Any],
    record_id: int,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, int]:
    """Write or refresh the full score history of one record's searched words.

    Words are taken from ``trend_json`` with ``extract_searched_words`` and
    each is passed to ``ensure_word_full_scores``. A word whose processing
    raises is counted under ``fetch_failed`` and skipped.

    Returns:
        Counters ``words_found``, ``inserted``, ``updated``, ``cached`` and
        ``fetch_failed``. A ``dry_run`` action is counted as ``inserted``.
    """
    words = extract_searched_words(trend_json)
    summary = {
        "words_found": len(words),
        "inserted": 0,
        "updated": 0,
        "cached": 0,
        "fetch_failed": 0,
    }

    # Which counter each action increments; other actions change no counter.
    counter_for_action = {
        "inserted": "inserted",
        "updated": "updated",
        "cached": "cached",
        "dry_run": "inserted",
    }

    for name in words:
        try:
            _, action = ensure_word_full_scores(
                repository,
                api_client,
                api_url,
                keyword=name,
                dry_run=dry_run,
            )
        except Exception as exc:
            summary["fetch_failed"] += 1
            if verbose:
                print(f"  fetch failed word={name}: {exc}")
            continue

        counter = counter_for_action.get(action)
        if counter is not None:
            summary[counter] += 1
        if verbose:
            print(f"  word={name} action={action}")

    return summary
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def backfill_wxindex_words(
    repository: HotContentRepository,
    api_client: JsonApiClient,
    api_url: str,
    *,
    since_date: date = WXINDEX_WORDS_RECORD_SINCE,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, int]:
    """Scan stored records and aggregate every WeChat index search word.

    Records are loaded with ``repository.list_records_with_wxindex_trend``
    starting at midnight of ``since_date`` (Shanghai timezone). Each record's
    ``wxindex_trend_json`` is synced with ``sync_words_from_trend_json``,
    which calls the API for the historical backfill.

    Returns:
        Counters ``records_scanned``, ``records_with_words``, ``words_found``,
        ``inserted``, ``updated``, ``cached``, ``fetch_failed`` and
        ``invalid_json``.
    """
    summary = dict.fromkeys(
        (
            "records_scanned",
            "records_with_words",
            "words_found",
            "inserted",
            "updated",
            "cached",
            "fetch_failed",
            "invalid_json",
        ),
        0,
    )
    merged_keys = ("words_found", "inserted", "updated", "cached", "fetch_failed")

    since_dt = datetime(
        since_date.year, since_date.month, since_date.day, tzinfo=SHANGHAI_TZ
    )

    for row in repository.list_records_with_wxindex_trend(since_dt=since_dt):
        summary["records_scanned"] += 1
        record_id = int(row["id"])

        try:
            trend_json = row.get("wxindex_trend_json")
        except (TypeError, ValueError):
            trend_json = None
        if not isinstance(trend_json, dict):
            summary["invalid_json"] += 1
            continue

        words = extract_searched_words(trend_json)
        if not words:
            continue

        summary["records_with_words"] += 1
        if verbose:
            print(f"id={record_id} words={words}")

        result = sync_words_from_trend_json(
            repository,
            api_client,
            api_url,
            trend_json=trend_json,
            record_id=record_id,
            dry_run=dry_run,
            verbose=verbose,
        )
        for key in merged_keys:
            summary[key] += result[key]

    return summary
|