test_rule_judgment_scorecard.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. from __future__ import annotations
  2. from copy import deepcopy
  3. import pytest
  4. from content_agent.business_modules.rule_judgment.evaluator import decide
  5. from content_agent.run_service import RunService
  6. from content_agent.schemas import RunStartRequest
  7. from tests.p1_helpers import FakeQueryVariantClient, REAL_SOURCE_FIXTURE
  8. def test_scorecard_uses_active_dimensions_and_thresholds(tmp_path):
  9. state = _state(tmp_path)
  10. bundle = deepcopy(state["evidence_bundles"][0])
  11. # M3 2-dim scorecard: relevance gte0.8 -> 60, platform_heat gte0.4 -> 20 => 80 (pool).
  12. bundle["pattern_match_result"]["relevance_score"] = 0.8
  13. bundle["content_engagement_metrics"]["platform_heat"] = 0.4
  14. decision = decide(
  15. state["run_id"],
  16. state["policy_run_id"],
  17. 1,
  18. bundle,
  19. state["policy_bundle"],
  20. )
  21. assert decision["decision_action"] == "ADD_TO_CONTENT_POOL"
  22. assert decision["score"] == 80
  23. dimensions = {row["key"]: row for row in decision["scorecard"]["dimensions"]}
  24. assert dimensions["relevance"]["score"] == 60
  25. assert dimensions["platform_heat"]["score"] == 20
  26. # 2026-06-12 清理: 5 个 deprecated 维度已从规则包物理删除,scorecard 只剩 2 个 active 维度。
  27. assert set(dimensions) == {"relevance", "platform_heat"}
  28. def test_rule_pack_scorecard_has_only_two_active_dimensions():
  29. # 配置层钉死: 规则包里只剩 relevance + platform_heat,5 个废弃维度定义已删干净。
  30. import json
  31. from pathlib import Path
  32. rule_pack = json.loads(
  33. Path("product_documents/规则包/douyin_rule_packs.v1.json").read_text(encoding="utf-8")
  34. )
  35. for pack in rule_pack["rule_packs"]:
  36. keys = [dim["key"] for dim in pack["scorecard"]["dimensions"]]
  37. assert keys == ["relevance", "platform_heat"], keys
  38. assert all(dim["runtime_status"] == "active" for dim in pack["scorecard"]["dimensions"])
  39. def test_missing_scoring_rules_fail_fast(tmp_path):
  40. state = _state(tmp_path)
  41. policy_bundle = deepcopy(state["policy_bundle"])
  42. policy_bundle["rule_pack"]["scorecard"]["scoring_rules"] = []
  43. with pytest.raises(ValueError, match="active scorecard dimensions require"):
  44. decide(
  45. state["run_id"],
  46. state["policy_run_id"],
  47. 1,
  48. state["evidence_bundles"][0],
  49. policy_bundle,
  50. )
  51. def test_no_scoring_evidence_uses_missing_score_policy(tmp_path):
  52. state = _state(tmp_path)
  53. bundle = deepcopy(state["evidence_bundles"][0])
  54. # Drop evidence for both active dims (relevance + platform_heat) so no scoring rule matches.
  55. bundle["pattern_match_result"].pop("relevance_score", None)
  56. bundle["content_engagement_metrics"].pop("platform_heat", None)
  57. decision = decide(state["run_id"], state["policy_run_id"], 1, bundle, state["policy_bundle"])
  58. assert decision["decision_action"] == "REJECT_CONTENT"
  59. assert decision["decision_reason_code"] == "missing_score"
  60. assert decision["search_query_effect_status"] == "failed"
  61. assert decision["score"] is None
  62. @pytest.mark.parametrize(
  63. ("total_score", "expected_action", "expected_status"),
  64. [
  65. (59, "REJECT_CONTENT", "failed"),
  66. (60, "KEEP_CONTENT_FOR_REVIEW", "pending"),
  67. (69, "KEEP_CONTENT_FOR_REVIEW", "pending"),
  68. (70, "ADD_TO_CONTENT_POOL", "success"),
  69. ],
  70. )
  71. def test_score_threshold_boundaries(tmp_path, total_score, expected_action, expected_status):
  72. state = _state(tmp_path)
  73. policy_bundle = _policy_with_total_score(state["policy_bundle"], total_score)
  74. decision = decide(
  75. state["run_id"],
  76. state["policy_run_id"],
  77. 1,
  78. state["evidence_bundles"][0],
  79. policy_bundle,
  80. )
  81. assert decision["score"] == total_score
  82. assert decision["decision_action"] == expected_action
  83. assert decision["search_query_effect_status"] == expected_status
  84. def test_scoring_rule_unknown_operator_fails_fast(tmp_path):
  85. state = _state(tmp_path)
  86. policy_bundle = deepcopy(state["policy_bundle"])
  87. scoring_rules = policy_bundle["rule_pack"]["scorecard"]["scoring_rules"]
  88. for rule in scoring_rules:
  89. if rule["scoring_rule_id"] == "score_relevance_high":
  90. rule["operator"] = "contains"
  91. with pytest.raises(ValueError, match="unsupported rule operator"):
  92. decide(
  93. state["run_id"],
  94. state["policy_run_id"],
  95. 1,
  96. state["evidence_bundles"][0],
  97. policy_bundle,
  98. )
  99. def test_single_missing_dimension_scores_zero_and_keeps_threshold_flow(tmp_path):
  100. state = _state(tmp_path)
  101. bundle = deepcopy(state["evidence_bundles"][0])
  102. # relevance evidence present (0.8 -> 60); platform_heat evidence absent -> scores 0, not missing_score.
  103. bundle["pattern_match_result"]["relevance_score"] = 0.8
  104. bundle["content_engagement_metrics"].pop("platform_heat", None)
  105. decision = decide(state["run_id"], state["policy_run_id"], 1, bundle, state["policy_bundle"])
  106. dimensions = {row["key"]: row for row in decision["scorecard"]["dimensions"]}
  107. assert dimensions["platform_heat"]["score_missing"] is True
  108. assert dimensions["platform_heat"]["score"] == 0
  109. assert dimensions["relevance"]["score_missing"] is False
  110. assert dimensions["relevance"]["score"] == 60
  111. assert decision["score"] == 60
  112. assert decision["decision_reason_code"] != "missing_score"
  113. assert decision["scorecard"]["score_missing"] is False
  114. def test_all_dimensions_missing_uses_score_missing_policy(tmp_path):
  115. state = _state(tmp_path)
  116. bundle = deepcopy(state["evidence_bundles"][0])
  117. # Both active dims (relevance + platform_heat) lack evidence -> score_missing policy.
  118. bundle["pattern_match_result"].pop("relevance_score", None)
  119. bundle["content_engagement_metrics"].pop("platform_heat", None)
  120. decision = decide(state["run_id"], state["policy_run_id"], 1, bundle, state["policy_bundle"])
  121. assert decision["decision_action"] == "REJECT_CONTENT"
  122. assert decision["decision_reason_code"] == "missing_score"
  123. assert decision["score"] is None
  124. assert decision["scorecard"]["score_missing"] is True
  125. assert all(row["score_missing"] for row in decision["scorecard"]["dimensions"])
  126. def test_dimension_missing_metadata_is_recorded(tmp_path):
  127. state = _state(tmp_path)
  128. bundle = deepcopy(state["evidence_bundles"][0])
  129. bundle["content_engagement_metrics"].pop("platform_heat", None)
  130. decision = decide(state["run_id"], state["policy_run_id"], 1, bundle, state["policy_bundle"])
  131. assert decision["decision_replay_data"]["missing_dimensions"] == ["platform_heat"]
  132. full = deepcopy(state["evidence_bundles"][0])
  133. full["content_engagement_metrics"]["platform_heat"] = 0.8
  134. full_decision = decide(state["run_id"], state["policy_run_id"], 2, full, state["policy_bundle"])
  135. assert full_decision["decision_replay_data"]["missing_dimensions"] == []
  136. def _state(tmp_path):
  137. service = RunService(
  138. runtime_root=tmp_path / "runtime" / "v1",
  139. query_variant_client=FakeQueryVariantClient(),
  140. )
  141. return service.start_run(
  142. RunStartRequest(platform_mode="mock", source=str(REAL_SOURCE_FIXTURE))
  143. )
  144. def _policy_with_total_score(policy_bundle, total_score):
  145. """Build an exact total score from the two M3 active dims (relevance max60, platform_heat max40).
  146. Replaces every scoring rule with one always-matching rule per active dimension whose
  147. score_value sums to ``total_score`` (relevance carries up to 60, heat the remainder).
  148. """
  149. policy_bundle = deepcopy(policy_bundle)
  150. scorecard = policy_bundle["rule_pack"]["scorecard"]
  151. relevance_score = min(total_score, 60)
  152. heat_score = total_score - relevance_score
  153. assert heat_score <= 40, "total_score exceeds combined active-dimension caps"
  154. scorecard["scoring_rules"] = [
  155. {
  156. "scoring_rule_id": "test_relevance_score",
  157. "dimension_key": "relevance",
  158. "field_path": "content.decision_target_type",
  159. "operator": "eq",
  160. "expected_value": "content",
  161. "score_value": relevance_score,
  162. "priority": 1,
  163. "enabled": True,
  164. },
  165. {
  166. "scoring_rule_id": "test_heat_score",
  167. "dimension_key": "platform_heat",
  168. "field_path": "content.decision_target_type",
  169. "operator": "eq",
  170. "expected_value": "content",
  171. "score_value": heat_score,
  172. "priority": 1,
  173. "enabled": True,
  174. },
  175. ]
  176. return policy_bundle