Sfoglia il codice sorgente

修改热点词统计方式

xueyiming 1 settimana fa
parent
commit
a50627e226
2 ha cambiato i file con 68 aggiunte e 23 eliminazioni
  1. 8 8
      app/hot_content/repository.py
  2. 60 15
      app/hot_content/wxindex_words.py

+ 8 - 8
app/hot_content/repository.py

@@ -1145,25 +1145,25 @@ class HotContentRepository:
                 )
         return stale_words
 
-    def list_low_avg_wxindex_words(
+    def list_low_max_wxindex_words(
         self,
         *,
-        min_avg_score: float,
+        min_max_score: float,
     ) -> list[dict[str, Any]]:
-        """按 name 聚合,返回平均分低于阈值的词。"""
+        """按 name 聚合,返回最大值低于阈值的词。"""
         self._ensure_wxindex_words_table()
         sql = """
             SELECT
                 name,
-                AVG(total_score) AS avg_score,
+                MAX(total_score) AS max_score,
                 COUNT(*) AS row_count
             FROM hot_content_wxindex_words
             GROUP BY name
-            HAVING AVG(total_score) < %s
+            HAVING MAX(total_score) < %s
             ORDER BY name ASC
         """
         with self.conn.cursor() as cursor:
-            cursor.execute(sql, (min_avg_score,))
+            cursor.execute(sql, (min_max_score,))
             rows = cursor.fetchall()
 
         low_words: list[dict[str, Any]] = []
@@ -1172,14 +1172,14 @@ class HotContentRepository:
             if not name:
                 continue
             try:
-                avg_score = float(row["avg_score"])
+                max_score = float(row["max_score"])
                 row_count = int(row["row_count"])
             except (TypeError, ValueError, KeyError):
                 continue
             low_words.append(
                 {
                     "name": name,
-                    "avg_score": avg_score,
+                    "max_score": max_score,
                     "row_count": row_count,
                 }
             )

+ 60 - 15
app/hot_content/wxindex_words.py

@@ -12,7 +12,7 @@ from app.hot_content.timezone import SHANGHAI_TZ
 
 WXINDEX_WORDS_START_YMD = "20260601"
 WXINDEX_WORDS_RECORD_SINCE = date(2026, 6, 11)
-WXINDEX_WORDS_MIN_AVG_SCORE = 100_000.0
+WXINDEX_WORDS_MIN_MAX_SCORE = 100_000.0
 
 
 def get_wxindex_end_ymd(*, today: date | None = None) -> str:
@@ -86,6 +86,33 @@ def fetch_wxindex_scores(
     return parse_wxindex_total_scores(wx_resp)
 
 
+def get_max_total_score(scores: list[dict[str, Any]]) -> float | None:
+    """从指数序列中取 total_score 最大值。"""
+    values: list[float] = []
+    for item in scores:
+        if not isinstance(item, dict):
+            continue
+        try:
+            values.append(float(item["total_score"]))
+        except (TypeError, ValueError, KeyError):
+            continue
+    if not values:
+        return None
+    return max(values)
+
+
+def word_meets_max_score_threshold(
+    scores: list[dict[str, Any]],
+    *,
+    min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
+) -> bool:
+    """新增词时:最大值需严格大于阈值(不超过阈值则不添加)。"""
+    max_score = get_max_total_score(scores)
+    if max_score is None:
+        return False
+    return max_score > min_max_score
+
+
 def get_word_score_bounds(
     scores: list[dict[str, Any]],
 ) -> tuple[str | None, str | None]:
@@ -275,21 +302,21 @@ def refresh_stale_wxindex_words(
     return summary
 
 
-def cleanup_low_avg_wxindex_words(
+def cleanup_low_max_wxindex_words(
     repository: HotContentRepository,
     *,
-    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
+    min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
     dry_run: bool = False,
     verbose: bool = False,
 ) -> dict[str, int | float]:
-    """删除各 dt 平均分低于阈值的词(按 name 整词删除)。"""
+    """删除各 dt 最大值低于阈值的词(按 name 整词删除)。"""
     summary: dict[str, int | float] = {
-        "min_avg_score": min_avg_score,
-        "low_avg_words": 0,
+        "min_max_score": min_max_score,
+        "low_max_words": 0,
         "deleted_rows": 0,
     }
-    low_words = repository.list_low_avg_wxindex_words(min_avg_score=min_avg_score)
-    summary["low_avg_words"] = len(low_words)
+    low_words = repository.list_low_max_wxindex_words(min_max_score=min_max_score)
+    summary["low_max_words"] = len(low_words)
     if not low_words:
         return summary
 
@@ -298,7 +325,7 @@ def cleanup_low_avg_wxindex_words(
             for item in low_words:
                 print(
                     f"[dry-run] would delete word={item['name']} "
-                    f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
+                    f"max_score={item['max_score']:.0f} rows={item['row_count']}"
                 )
         summary["deleted_rows"] = sum(int(item["row_count"]) for item in low_words)
         return summary
@@ -310,7 +337,7 @@ def cleanup_low_avg_wxindex_words(
         for item in low_words:
             print(
                 f"deleted word={item['name']} "
-                f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
+                f"max_score={item['max_score']:.0f} rows={item['row_count']}"
             )
     return summary
 
@@ -321,11 +348,11 @@ def run_wxindex_words_daily_job(
     api_url: str,
     *,
     end_ymd: str | None = None,
-    min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
+    min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
     dry_run: bool = False,
     verbose: bool = False,
 ) -> dict[str, Any]:
-    """定时任务:先补全缺失日期,再清理低值词。"""
+    """定时任务:先补全缺失日期,再清理低最大值词。"""
     refresh_summary = refresh_stale_wxindex_words(
         repository,
         api_client,
@@ -334,9 +361,9 @@ def run_wxindex_words_daily_job(
         dry_run=dry_run,
         verbose=verbose,
     )
-    cleanup_summary = cleanup_low_avg_wxindex_words(
+    cleanup_summary = cleanup_low_max_wxindex_words(
         repository,
-        min_avg_score=min_avg_score,
+        min_max_score=min_max_score,
         dry_run=dry_run,
         verbose=verbose,
     )
@@ -389,6 +416,12 @@ def ensure_word_full_scores(
     if not api_scores:
         return stored_scores, "api_empty"
 
+    if not had_data and not word_meets_max_score_threshold(
+        api_scores,
+        min_max_score=WXINDEX_WORDS_MIN_MAX_SCORE,
+    ):
+        return [], "below_threshold"
+
     inserted, _skipped = repository.save_wxindex_daily_scores(
         name=word,
         scores=api_scores,
@@ -420,6 +453,7 @@ def sync_words_from_trend_json(
         "updated": 0,
         "cached": 0,
         "api_empty": 0,
+        "below_threshold": 0,
         "fetch_failed": 0,
     }
     words = extract_searched_words(trend_json)
@@ -450,6 +484,8 @@ def sync_words_from_trend_json(
             summary["cached"] += 1
         elif action == "api_empty":
             summary["api_empty"] += 1
+        elif action == "below_threshold":
+            summary["below_threshold"] += 1
         elif action == "dry_run":
             summary["inserted"] += 1
 
@@ -477,6 +513,7 @@ def backfill_wxindex_words(
         "updated": 0,
         "cached": 0,
         "api_empty": 0,
+        "below_threshold": 0,
         "fetch_failed": 0,
         "invalid_json": 0,
     }
@@ -513,7 +550,15 @@ def backfill_wxindex_words(
             dry_run=dry_run,
             verbose=verbose,
         )
-        for key in ("words_found", "inserted", "updated", "cached", "api_empty", "fetch_failed"):
+        for key in (
+            "words_found",
+            "inserted",
+            "updated",
+            "cached",
+            "api_empty",
+            "below_threshold",
+            "fetch_failed",
+        ):
             summary[key] += result[key]
 
     return summary