|
@@ -12,7 +12,7 @@ from app.hot_content.timezone import SHANGHAI_TZ
|
|
|
|
|
|
|
|
WXINDEX_WORDS_START_YMD = "20260601"
|
|
WXINDEX_WORDS_START_YMD = "20260601"
|
|
|
WXINDEX_WORDS_RECORD_SINCE = date(2026, 6, 11)
|
|
WXINDEX_WORDS_RECORD_SINCE = date(2026, 6, 11)
|
|
|
-WXINDEX_WORDS_MIN_AVG_SCORE = 100_000.0
|
|
|
|
|
|
|
+WXINDEX_WORDS_MIN_MAX_SCORE = 100_000.0
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_wxindex_end_ymd(*, today: date | None = None) -> str:
|
|
def get_wxindex_end_ymd(*, today: date | None = None) -> str:
|
|
@@ -86,6 +86,33 @@ def fetch_wxindex_scores(
|
|
|
return parse_wxindex_total_scores(wx_resp)
|
|
return parse_wxindex_total_scores(wx_resp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def get_max_total_score(scores: list[dict[str, Any]]) -> float | None:
|
|
|
|
|
+ """从指数序列中取 total_score 最大值。"""
|
|
|
|
|
+ values: list[float] = []
|
|
|
|
|
+ for item in scores:
|
|
|
|
|
+ if not isinstance(item, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+ try:
|
|
|
|
|
+ values.append(float(item["total_score"]))
|
|
|
|
|
+ except (TypeError, ValueError, KeyError):
|
|
|
|
|
+ continue
|
|
|
|
|
+ if not values:
|
|
|
|
|
+ return None
|
|
|
|
|
+ return max(values)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def word_meets_max_score_threshold(
|
|
|
|
|
+ scores: list[dict[str, Any]],
|
|
|
|
|
+ *,
|
|
|
|
|
+ min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
|
|
|
|
|
+) -> bool:
|
|
|
|
|
+ """新增词时:最大值需严格大于阈值(不超过阈值则不添加)。"""
|
|
|
|
|
+ max_score = get_max_total_score(scores)
|
|
|
|
|
+ if max_score is None:
|
|
|
|
|
+ return False
|
|
|
|
|
+ return max_score > min_max_score
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def get_word_score_bounds(
|
|
def get_word_score_bounds(
|
|
|
scores: list[dict[str, Any]],
|
|
scores: list[dict[str, Any]],
|
|
|
) -> tuple[str | None, str | None]:
|
|
) -> tuple[str | None, str | None]:
|
|
@@ -275,21 +302,21 @@ def refresh_stale_wxindex_words(
|
|
|
return summary
|
|
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
-def cleanup_low_avg_wxindex_words(
|
|
|
|
|
|
|
+def cleanup_low_max_wxindex_words(
|
|
|
repository: HotContentRepository,
|
|
repository: HotContentRepository,
|
|
|
*,
|
|
*,
|
|
|
- min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
|
|
|
|
|
|
|
+ min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
|
|
|
dry_run: bool = False,
|
|
dry_run: bool = False,
|
|
|
verbose: bool = False,
|
|
verbose: bool = False,
|
|
|
) -> dict[str, int | float]:
|
|
) -> dict[str, int | float]:
|
|
|
- """删除各 dt 平均分低于阈值的词(按 name 整词删除)。"""
|
|
|
|
|
|
|
+ """删除各 dt 最大值低于阈值的词(按 name 整词删除)。"""
|
|
|
summary: dict[str, int | float] = {
|
|
summary: dict[str, int | float] = {
|
|
|
- "min_avg_score": min_avg_score,
|
|
|
|
|
- "low_avg_words": 0,
|
|
|
|
|
|
|
+ "min_max_score": min_max_score,
|
|
|
|
|
+ "low_max_words": 0,
|
|
|
"deleted_rows": 0,
|
|
"deleted_rows": 0,
|
|
|
}
|
|
}
|
|
|
- low_words = repository.list_low_avg_wxindex_words(min_avg_score=min_avg_score)
|
|
|
|
|
- summary["low_avg_words"] = len(low_words)
|
|
|
|
|
|
|
+ low_words = repository.list_low_max_wxindex_words(min_max_score=min_max_score)
|
|
|
|
|
+ summary["low_max_words"] = len(low_words)
|
|
|
if not low_words:
|
|
if not low_words:
|
|
|
return summary
|
|
return summary
|
|
|
|
|
|
|
@@ -298,7 +325,7 @@ def cleanup_low_avg_wxindex_words(
|
|
|
for item in low_words:
|
|
for item in low_words:
|
|
|
print(
|
|
print(
|
|
|
f"[dry-run] would delete word={item['name']} "
|
|
f"[dry-run] would delete word={item['name']} "
|
|
|
- f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
|
|
|
|
|
|
|
+ f"max_score={item['max_score']:.0f} rows={item['row_count']}"
|
|
|
)
|
|
)
|
|
|
summary["deleted_rows"] = sum(int(item["row_count"]) for item in low_words)
|
|
summary["deleted_rows"] = sum(int(item["row_count"]) for item in low_words)
|
|
|
return summary
|
|
return summary
|
|
@@ -310,7 +337,7 @@ def cleanup_low_avg_wxindex_words(
|
|
|
for item in low_words:
|
|
for item in low_words:
|
|
|
print(
|
|
print(
|
|
|
f"deleted word={item['name']} "
|
|
f"deleted word={item['name']} "
|
|
|
- f"avg_score={item['avg_score']:.0f} rows={item['row_count']}"
|
|
|
|
|
|
|
+ f"max_score={item['max_score']:.0f} rows={item['row_count']}"
|
|
|
)
|
|
)
|
|
|
return summary
|
|
return summary
|
|
|
|
|
|
|
@@ -321,11 +348,11 @@ def run_wxindex_words_daily_job(
|
|
|
api_url: str,
|
|
api_url: str,
|
|
|
*,
|
|
*,
|
|
|
end_ymd: str | None = None,
|
|
end_ymd: str | None = None,
|
|
|
- min_avg_score: float = WXINDEX_WORDS_MIN_AVG_SCORE,
|
|
|
|
|
|
|
+ min_max_score: float = WXINDEX_WORDS_MIN_MAX_SCORE,
|
|
|
dry_run: bool = False,
|
|
dry_run: bool = False,
|
|
|
verbose: bool = False,
|
|
verbose: bool = False,
|
|
|
) -> dict[str, Any]:
|
|
) -> dict[str, Any]:
|
|
|
- """定时任务:先补全缺失日期,再清理低均值词。"""
|
|
|
|
|
|
|
+ """定时任务:先补全缺失日期,再清理低最大值词。"""
|
|
|
refresh_summary = refresh_stale_wxindex_words(
|
|
refresh_summary = refresh_stale_wxindex_words(
|
|
|
repository,
|
|
repository,
|
|
|
api_client,
|
|
api_client,
|
|
@@ -334,9 +361,9 @@ def run_wxindex_words_daily_job(
|
|
|
dry_run=dry_run,
|
|
dry_run=dry_run,
|
|
|
verbose=verbose,
|
|
verbose=verbose,
|
|
|
)
|
|
)
|
|
|
- cleanup_summary = cleanup_low_avg_wxindex_words(
|
|
|
|
|
|
|
+ cleanup_summary = cleanup_low_max_wxindex_words(
|
|
|
repository,
|
|
repository,
|
|
|
- min_avg_score=min_avg_score,
|
|
|
|
|
|
|
+ min_max_score=min_max_score,
|
|
|
dry_run=dry_run,
|
|
dry_run=dry_run,
|
|
|
verbose=verbose,
|
|
verbose=verbose,
|
|
|
)
|
|
)
|
|
@@ -389,6 +416,12 @@ def ensure_word_full_scores(
|
|
|
if not api_scores:
|
|
if not api_scores:
|
|
|
return stored_scores, "api_empty"
|
|
return stored_scores, "api_empty"
|
|
|
|
|
|
|
|
|
|
+ if not had_data and not word_meets_max_score_threshold(
|
|
|
|
|
+ api_scores,
|
|
|
|
|
+ min_max_score=WXINDEX_WORDS_MIN_MAX_SCORE,
|
|
|
|
|
+ ):
|
|
|
|
|
+ return [], "below_threshold"
|
|
|
|
|
+
|
|
|
inserted, _skipped = repository.save_wxindex_daily_scores(
|
|
inserted, _skipped = repository.save_wxindex_daily_scores(
|
|
|
name=word,
|
|
name=word,
|
|
|
scores=api_scores,
|
|
scores=api_scores,
|
|
@@ -420,6 +453,7 @@ def sync_words_from_trend_json(
|
|
|
"updated": 0,
|
|
"updated": 0,
|
|
|
"cached": 0,
|
|
"cached": 0,
|
|
|
"api_empty": 0,
|
|
"api_empty": 0,
|
|
|
|
|
+ "below_threshold": 0,
|
|
|
"fetch_failed": 0,
|
|
"fetch_failed": 0,
|
|
|
}
|
|
}
|
|
|
words = extract_searched_words(trend_json)
|
|
words = extract_searched_words(trend_json)
|
|
@@ -450,6 +484,8 @@ def sync_words_from_trend_json(
|
|
|
summary["cached"] += 1
|
|
summary["cached"] += 1
|
|
|
elif action == "api_empty":
|
|
elif action == "api_empty":
|
|
|
summary["api_empty"] += 1
|
|
summary["api_empty"] += 1
|
|
|
|
|
+ elif action == "below_threshold":
|
|
|
|
|
+ summary["below_threshold"] += 1
|
|
|
elif action == "dry_run":
|
|
elif action == "dry_run":
|
|
|
summary["inserted"] += 1
|
|
summary["inserted"] += 1
|
|
|
|
|
|
|
@@ -477,6 +513,7 @@ def backfill_wxindex_words(
|
|
|
"updated": 0,
|
|
"updated": 0,
|
|
|
"cached": 0,
|
|
"cached": 0,
|
|
|
"api_empty": 0,
|
|
"api_empty": 0,
|
|
|
|
|
+ "below_threshold": 0,
|
|
|
"fetch_failed": 0,
|
|
"fetch_failed": 0,
|
|
|
"invalid_json": 0,
|
|
"invalid_json": 0,
|
|
|
}
|
|
}
|
|
@@ -513,7 +550,15 @@ def backfill_wxindex_words(
|
|
|
dry_run=dry_run,
|
|
dry_run=dry_run,
|
|
|
verbose=verbose,
|
|
verbose=verbose,
|
|
|
)
|
|
)
|
|
|
- for key in ("words_found", "inserted", "updated", "cached", "api_empty", "fetch_failed"):
|
|
|
|
|
|
|
+ for key in (
|
|
|
|
|
+ "words_found",
|
|
|
|
|
+ "inserted",
|
|
|
|
|
+ "updated",
|
|
|
|
|
+ "cached",
|
|
|
|
|
+ "api_empty",
|
|
|
|
|
+ "below_threshold",
|
|
|
|
|
+ "fetch_failed",
|
|
|
|
|
+ ):
|
|
|
summary[key] += result[key]
|
|
summary[key] += result[key]
|
|
|
|
|
|
|
|
return summary
|
|
return summary
|