Browse Source

feat(r3-S1): 热度改配置驱动的每平台互动复合机制(行为字节不变)

R3 第一步:把'只看点赞、锚点写死在代码'换成'每平台读 profile.heat.signals 做复合'
的机制——每字段 log 归一化后对在场(>0)字段加权平均,缺字段/0 值排除、权重重新归一。
锚点从 platform_heat.py 常量搬进 platform_profiles(R3 工程债清除);
heat_score(digg_count,platform)→heat_score(statistics,platform),builder 一行改。

第一步各平台只配 digg 单信号=R3 前锚点(douyin 1e4/1e6、shipinhao 50/5e4),
其余平台无 heat 段回退默认锚点单信号——**行为字节等价**:replay 快照(real_id45/
sph_caihong/syn_*)进池复看分布零变,test_platform_heat 6 锚点值不变。
profile 加 heat 段不破 _validate_profile(只校验 edges)。
新增复合守卫单测×3(单信号=旧值/0值排除/多字段加权);330 passed(327+3)。
第二步(各平台多字段调权、抖音加 comment/share/collect)另起 commit。

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Sam Lee 1 day ago
parent
commit
7385cb752e

+ 1 - 1
content_agent/business_modules/content_discovery/content_discovery_builder.py

@@ -150,7 +150,7 @@ def _build_evidence_bundle(
             "statistics": result["statistics"],
             **result["statistics"],
             "platform_heat": heat_score(
-                result["statistics"].get("digg_count"),
+                result["statistics"],
                 discovered_content_item["platform"],
             ),
         },

+ 45 - 18
content_agent/business_modules/content_discovery/platform_heat.py

@@ -1,29 +1,56 @@
-"""Platform heat normalization (V3-M3A).
+"""平台热度归一化(V3-M3A;2026-06-12 R3 改配置驱动的每平台互动复合)。
 
-跨平台唯一公共互动指标是点赞(digg_count;视频号仅 like_count→digg_count)。
-绝对值不可跨平台比较(抖音爆款 5e6 vs 视频号样本 ~1e2),故按平台锚点做
-对数归一化:digg ≤ floor → 0,≥ ceil → 1,之间按 log10 插值。
-锚点为拍板起步值,M7 真实跑测后按数据标定。
+各平台能拉到的互动数据天差地别(抖音无播放量、视频号只有点赞、快手/B站有播放量),
+一套公式套不上。每平台从自己 profile 的 `heat.signals` 读「用哪些字段、各自锚点和权重」,
+每字段先 log 归一化 0~1,再对**在场(>0)**字段加权平均——缺字段/0 值不计入,
+权重在在场字段间重新归一。锚点从代码搬进 platform_profiles,加平台/调权重只改 JSON。
+
+无 heat 段或无 profile 时回退「点赞单信号 + 默认锚点」(=R3 前老行为)。
 """
 
 from __future__ import annotations
 
 from math import log10
+from pathlib import Path
 from typing import Any
 
-# (floor, ceil) per platform — 起步值,M7 标定。
-_HEAT_ANCHORS: dict[str, tuple[float, float]] = {
-    "douyin": (10000.0, 1000000.0),
-    "shipinhao": (50.0, 50000.0),
-}
-_DEFAULT_ANCHOR = (100.0, 100000.0)
+from content_agent.integrations import config_store
+
+_PROFILE_DIR = Path("tech_documents/数据接口与来源/platform_profiles")
+_DEFAULT_ANCHOR = (100.0, 100000.0)  # 兜底:profile 缺 heat 段时,仅用点赞单信号
+
 
+def _log_norm(value: float, floor: float, ceil: float) -> float:
+    """Map ``value`` onto [0, 1] by log10 interpolation between two anchors.
+
+    ``log10(value + 1)`` is placed linearly between ``log10(floor)`` and
+    ``log10(ceil)`` and the result is clamped to [0, 1]. For integer counts
+    this means anything below ``floor`` scores 0.0 and anything at or above
+    ``ceil`` scores 1.0.
+
+    Args:
+        value: Raw count; must be greater than -1 so ``log10(value + 1)``
+            is defined.
+        floor: Lower anchor, must be > 0.
+        ceil: Upper anchor, must be > ``floor``.
+
+    Raises:
+        ValueError: If the anchors do not satisfy ``0 < floor < ceil``.
+    """
+    # Anchors are read from per-platform JSON, so fail loudly on a bad pair
+    # instead of surfacing a bare math-domain / zero-division error, or
+    # silently scoring on an inverted scale when ceil < floor.
+    if not 0 < floor < ceil:
+        raise ValueError(
+            f"invalid heat anchors: need 0 < floor < ceil, "
+            f"got floor={floor!r}, ceil={ceil!r}"
+        )
+    log_floor = log10(floor)
+    raw = (log10(value + 1) - log_floor) / (log10(ceil) - log_floor)
+    return min(max(raw, 0.0), 1.0)
 
-def heat_score(digg_count: Any, platform: str) -> float:
-    floor, ceil = _HEAT_ANCHORS.get(platform, _DEFAULT_ANCHOR)
+
+def _heat_signals(platform: str) -> list[dict[str, Any]]:
+    """Return the heat signal specs configured for ``platform``.
+
+    Reads ``heat.signals`` from ``<platform>.json`` under ``_PROFILE_DIR``.
+    When the profile cannot be loaded (``OSError``) or has no non-empty
+    ``heat.signals``, falls back to a single ``digg_count`` signal with
+    weight 1.0 and the ``_DEFAULT_ANCHOR`` floor/ceil.
+    """
     try:
-        digg = max(int(digg_count or 0), 0)
-    except (TypeError, ValueError):
-        digg = 0
-    raw = (log10(digg + 1) - log10(floor)) / (log10(ceil) - log10(floor))
-    return round(min(max(raw, 0.0), 1.0), 4)
+        # NOTE(review): _PROFILE_DIR is a relative path, so this presumably
+        # depends on the working directory unless load_json resolves it.
+        # NOTE(review): whether load_json caches is not visible here; if not,
+        # every heat_score call re-reads the profile file -- confirm.
+        profile, _ = config_store.load_json(_PROFILE_DIR / f"{platform}.json")
+    except (FileNotFoundError, OSError):
+        # FileNotFoundError is a subclass of OSError, so OSError alone would
+        # cover both. A malformed JSON file is presumably not handled here --
+        # verify what load_json raises.
+        profile = {}
+    # `or {}` tolerates a profile whose "heat" key is present but null.
+    signals = (profile.get("heat") or {}).get("signals")
+    if signals:
+        return signals
+    # No usable heat section: fall back to the digg-only default signal.
+    floor, ceil = _DEFAULT_ANCHOR
+    return [{"field": "digg_count", "weight": 1.0, "floor": floor, "ceil": ceil}]
+
+
+def heat_score(statistics: dict[str, Any], platform: str) -> float:
+    """Score an item's heat in [0, 1] from its engagement statistics.
+
+    Each signal configured for ``platform`` (via ``_heat_signals``) is
+    log-normalized against its own floor/ceil, then combined as a weighted
+    average over the signals that are present. A field that is missing,
+    zero, negative or not convertible to ``int`` is left out, and the
+    remaining weights are renormalized.
+
+    Args:
+        statistics: Mapping of engagement field name to count. ``None`` or
+            an empty mapping yields 0.0.
+        platform: Platform key used to look up the signal configuration.
+
+    Returns:
+        The weighted average rounded to 4 decimals, or 0.0 when no signal
+        is present.
+    """
+    if not statistics:
+        # Nothing to score; also keeps a None payload from raising.
+        return 0.0
+    weighted = 0.0
+    total_weight = 0.0
+    for signal in _heat_signals(platform):
+        try:
+            value = int(statistics.get(signal["field"]) or 0)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: int(float("inf")) -- treat like any other
+            # unusable value rather than crashing the whole score.
+            value = 0
+        if value <= 0:
+            continue  # absent / zero fields are excluded, not scored as 0
+        weight = float(signal.get("weight", 1.0))
+        if weight <= 0:
+            # A non-positive configured weight would zero or flip the
+            # denominator; ignore the signal instead.
+            continue
+        weighted += weight * _log_norm(value, float(signal["floor"]), float(signal["ceil"]))
+        total_weight += weight
+    if not total_weight:
+        return 0.0
+    return round(weighted / total_weight, 4)

+ 6 - 0
tech_documents/数据接口与来源/platform_profiles/douyin.json

@@ -9,6 +9,12 @@
     "retry": null,
     "video_download": { "downloadable": true, "headers": { "User-Agent": "iOS UA", "Referer": "https://www.douyin.com/" }, "note": "实测 206/video/mp4;play_addr 地址有时效" }
   },
+  "heat": {
+    "note": "R3 第一步(2026-06-12):点赞单信号=R3 前行为;抖音无播放量,第二步加 comment/share/collect 复合",
+    "signals": [
+      { "field": "digg_count", "weight": 1.0, "floor": 10000, "ceil": 1000000 }
+    ]
+  },
   "endpoints": {
     "search":         { "path": "/crawler/dou_yin/keyword", "params": { "keyword": "str", "content_type": "视频|图文", "sort_type": "最多点赞|最多分享", "cursor": "str" }, "response_shape": "raw(aweme 原生结构)" },
     "detail":         { "path": "/crawler/dou_yin/detail", "params": { "content_id": "str" }, "response_shape": "normalized(channel_* 归一化)" },

+ 6 - 0
tech_documents/数据接口与来源/platform_profiles/shipinhao.json

@@ -9,6 +9,12 @@
     "retry": { "search": { "trigger": "仅暂时性故障重试:code=25011 接口异常 / 超时 / 网络错误(code=0 空结果、参数/鉴权错 不重试)", "max_attempts": 3, "backoff_seconds": [1, 2, 4], "on_exhausted": "标 blocked·失败待查,游走绕行不卡流水线", "note": "实测首次 25011、重试 code=0;退避与 rate_limit_seconds=15 取较大值,故退避多被 15s 吸收(2026-06-11 拍板)" } },
     "video_download": { "downloadable": true, "headers": { "User-Agent": "PC UA", "Referer": "https://channels.weixin.qq.com/" }, "note": "实测 200/video/mp4;findermp 带 encfilekey" }
   },
+  "heat": {
+    "note": "R3 第一步(2026-06-12):点赞单信号=R3 前行为;视频号 comment/share/collect 实测常空,数据所限只能用点赞",
+    "signals": [
+      { "field": "digg_count", "weight": 1.0, "floor": 50, "ceil": 50000 }
+    ]
+  },
   "endpoints": {
     "search":       { "path": "/crawler/shi_pin_hao/keyword", "params": { "keyword": "str", "cursor": "str" }, "response_shape": "normalized(channel_*)", "stability": "unstable" },
     "blogger":      { "path": "/crawler/shi_pin_hao/blogger", "status": "blocked", "note": "实测 code=25011" },

+ 64 - 14
tests/test_platform_heat.py

@@ -1,38 +1,88 @@
-"""V3-M3A: platform heat log-normalization unit tests."""
+"""V3-M3A: platform heat 归一化单测;R3(2026-06-12)改配置驱动复合后扩充。"""
 
 from __future__ import annotations
 
+from math import log10
+
 from content_agent.business_modules.content_discovery.platform_heat import heat_score
 
 
+def _stat(digg, **extra):
+    """Build a statistics dict with ``digg_count`` first, then any extra fields."""
+    stats = {"digg_count": digg}
+    stats.update(extra)
+    return stats
+
+
+# --- R3 第一步:点赞单信号,与 R3 前行为字节等价(锚点现读自 profile heat 段) ---
+
 def test_heat_floor_maps_to_zero():
-    assert heat_score(10000, "douyin") == 0.0
-    assert heat_score(3, "douyin") == 0.0
+    # douyin's digg floor is 10000. At exactly the floor, log10(10000 + 1) is
+    # a hair above log10(10000); the score reads 0.0 only because heat_score
+    # rounds to 4 decimals.
+    assert heat_score(_stat(10000), "douyin") == 0.0
+    assert heat_score(_stat(3), "douyin") == 0.0
 
 
 def test_heat_ceil_maps_to_one():
-    assert heat_score(1000000, "douyin") == 1.0
-    assert heat_score(5034215, "douyin") == 1.0
+    # Counts at or above douyin's digg ceil (1000000) are clamped to 1.0.
+    assert heat_score(_stat(1000000), "douyin") == 1.0
+    assert heat_score(_stat(5034215), "douyin") == 1.0
 
 
 def test_heat_midpoint_between_zero_and_one():
     # geometric midpoint of (1e4, 1e6) is 1e5 -> ~0.5
-    assert abs(heat_score(100000, "douyin") - 0.5) < 0.01
+    # The +1 inside the log nudges the value off an exact 0.5, hence the
+    # tolerance rather than equality.
+    assert abs(heat_score(_stat(100000), "douyin") - 0.5) < 0.01
 
 
 def test_heat_missing_or_zero_digg_is_zero():
-    assert heat_score(None, "douyin") == 0.0
-    assert heat_score(0, "shipinhao") == 0.0
-    assert heat_score("not-a-number", "douyin") == 0.0
+    # None, 0 and a non-numeric string all resolve to a value of 0, so the
+    # only configured signal is excluded and heat_score returns 0.0.
+    assert heat_score(_stat(None), "douyin") == 0.0
+    assert heat_score(_stat(0), "shipinhao") == 0.0
+    assert heat_score(_stat("not-a-number"), "douyin") == 0.0
 
 
 def test_heat_unknown_platform_uses_default_anchor():
-    # default anchors (100, 1e5): below floor -> 0, at/above ceil -> 1
-    assert heat_score(50, "bilibili") == 0.0
-    assert heat_score(100000, "bilibili") == 1.0
+    # A platform without a heat section falls back to the default anchors
+    # (100, 1e5): below floor -> 0, at/above ceil -> 1.
+    # NOTE(review): assumes no bilibili profile defines heat.signals -- confirm.
+    assert heat_score(_stat(50), "bilibili") == 0.0
+    assert heat_score(_stat(100000), "bilibili") == 1.0
 
 
 def test_heat_shipinhao_low_digg_not_unfairly_zero():
     # 92 likes is meaningful on shipinhao (anchors 50..5e4) though tiny on douyin.
-    assert heat_score(92, "shipinhao") > 0.05
-    assert heat_score(92, "douyin") == 0.0
+    # shipinhao: (log10(93) - log10(50)) / (log10(50000) - log10(50)) ~= 0.09.
+    # douyin: 92 is far below the 10000 floor, so the score clamps to 0.0.
+    assert heat_score(_stat(92), "shipinhao") > 0.05
+    assert heat_score(_stat(92), "douyin") == 0.0
+
+
+# --- R3 复合机制守卫(放宽到多字段后这些仍成立,无需改) ---
+
+def test_heat_single_signal_equals_legacy_digg_only():
+    """A single-signal config must reproduce the legacy digg-only normalization."""
+    likes = 200000
+    # Legacy formula, spelled out with douyin's anchors (1e4, 1e6).
+    legacy = (log10(likes + 1) - log10(10000)) / (log10(1000000) - log10(10000))
+    assert heat_score(_stat(likes), "douyin") == round(legacy, 4)
+
+
+def test_heat_ignores_zero_and_missing_fields_in_composite(monkeypatch):
+    """Zero, None and absent fields drop out; remaining weights renormalize."""
+    from content_agent.business_modules.content_discovery import platform_heat
+
+    def fake_signals(platform):
+        return [
+            {"field": "digg_count", "weight": 0.5, "floor": 100, "ceil": 100000},
+            {"field": "comment_count", "weight": 0.5, "floor": 10, "ceil": 10000},
+        ]
+
+    monkeypatch.setattr(platform_heat, "_heat_signals", fake_signals)
+    # With comment_count excluded only digg_count remains, its 0.5 weight
+    # renormalizes to 1.0, and the score equals plain digg normalization.
+    expected = round((log10(10001) - log10(100)) / (log10(100000) - log10(100)), 4)
+    assert heat_score({"digg_count": 10000, "comment_count": 0}, "x") == expected
+    # The test name also promises the missing-field case: cover an absent
+    # key and an explicit None, not just a zero.
+    assert heat_score({"digg_count": 10000}, "x") == expected
+    assert heat_score({"digg_count": 10000, "comment_count": None}, "x") == expected
+
+
+def test_heat_weighted_average_of_two_signals(monkeypatch):
+    """Two present signals with equal weights average their normalized values."""
+    from content_agent.business_modules.content_discovery import platform_heat
+
+    two_signals = [
+        {"field": "digg_count", "weight": 0.5, "floor": 100, "ceil": 100000},
+        {"field": "comment_count", "weight": 0.5, "floor": 10, "ceil": 10000},
+    ]
+    monkeypatch.setattr(platform_heat, "_heat_signals", lambda platform: two_signals)
+
+    # Per-signal log normalization, computed by hand.
+    digg_part = (log10(10001) - log10(100)) / (log10(100000) - log10(100))
+    comment_part = (log10(1001) - log10(10)) / (log10(10000) - log10(10))
+
+    actual = heat_score({"digg_count": 10000, "comment_count": 1000}, "x")
+    # Weights sum to 1.0, so the weighted average is just the weighted sum.
+    assert actual == round(0.5 * digg_part + 0.5 * comment_part, 4)