# test_sph_corpus_shape.py
  1. """V3-M0B: shipinhao replay corpus (sph_caihong) shape and scrub checks."""
  2. from __future__ import annotations
  3. import json
  4. from pathlib import Path
  5. from tests.replay_harness import load_corpus, replay_case
  # Relative path of the sph_caihong case's input fixtures; it is resolved
  # against the current working directory when the tests run.
  _CASE_DIR = Path("tests") / "fixtures" / "cases" / "sph_caihong" / "input"

  # File names that must be present in _CASE_DIR.
  _REQUIRED_FILES = {
      "source_context.json",
      "discovered_content_items.jsonl",
  }

  # Keys every record of discovered_content_items.jsonl must carry.
  _REQUIRED_ITEM_KEYS = {
      "content_discovery_id",
      "search_query_id",
      "platform",
      "platform_content_id",
      "platform_content_format",
      "description",
      "platform_author_id",
      "author_display_name",
      "statistics",
      "tags",
  }

  # Substrings that must not occur in any fixture file. They are matched with a
  # plain, case-sensitive `in` test, in this order.
  _FORBIDDEN_SUBSTRINGS = [
      "PASSWORD=",
      "TOKEN=",
      "sec_uid",
      "account_id",
      "cookie",
  ]
  def test_sph_corpus_files_match_real_id45_filenames():
      """Compare the sph_caihong input entry names with the real_id45 case.

      Every name in ``_REQUIRED_FILES`` must exist in the sph_caihong input
      directory, and that directory must not contain an entry name that is
      absent from the real_id45 input directory.
      """
      reference_dir = Path("tests/fixtures/cases/real_id45/input")
      reference_names = {entry.name for entry in reference_dir.glob("*")}
      corpus_names = {entry.name for entry in _CASE_DIR.glob("*")}

      assert _REQUIRED_FILES.issubset(corpus_names)
      assert corpus_names.issubset(reference_names)
  def _load_jsonl_items(path):
      """Return the records stored in *path* as a list.

      The file is first parsed as a single JSON document, so a fixture that
      holds one JSON array keeps working. A genuine JSON Lines file with more
      than one record is not a single document (``json.loads`` raises
      ``JSONDecodeError`` for the trailing data), so in that case each
      non-blank line is parsed as its own record.

      Args:
          path: a ``pathlib.Path``-like object for a UTF-8 text file.

      Returns:
          A list of decoded records; a lone non-list document is wrapped in a
          one-element list.

      Raises:
          json.JSONDecodeError: if a line of the per-line fallback is not
              valid JSON.
      """
      text = path.read_text(encoding="utf-8")
      try:
          document = json.loads(text)
      except json.JSONDecodeError:
          # Split on "\n" only: str.splitlines() also breaks on characters such
          # as U+2028, which may appear unescaped inside JSON strings.
          return [json.loads(line) for line in text.split("\n") if line.strip()]
      return document if isinstance(document, list) else [document]


  def test_sph_corpus_items_are_canonical():
      """Check the shape of every record in discovered_content_items.jsonl.

      The corpus must hold at least four records; each must carry all of
      ``_REQUIRED_ITEM_KEYS``, have ``platform == "shipinhao"`` and an integer
      ``statistics.digg_count``.
      """
      items = _load_jsonl_items(_CASE_DIR / "discovered_content_items.jsonl")
      assert len(items) >= 4, f"expected at least 4 items, found {len(items)}"
      for index, item in enumerate(items):
          missing = _REQUIRED_ITEM_KEYS - set(item)
          assert not missing, f"item {index} is missing keys {sorted(missing)}"
          assert item["platform"] == "shipinhao", f"item {index} has platform {item['platform']!r}"
          assert isinstance(item["statistics"]["digg_count"], int), (
              f"item {index} has a non-integer statistics.digg_count"
          )
  def test_sph_corpus_no_forbidden_keys():
      """Fail if any fixture file contains one of ``_FORBIDDEN_SUBSTRINGS``.

      Regular files under ``_CASE_DIR`` are scanned recursively. Directory
      entries are skipped because they cannot be read as text. The check also
      fails when no file is found at all, so it cannot pass vacuously when the
      fixture directory is missing or the tests run from another working
      directory.
      """
      # Sorted so the first reported offender is the same on every run.
      fixture_files = sorted(entry for entry in _CASE_DIR.rglob("*") if entry.is_file())
      assert fixture_files, f"no fixture files found under {_CASE_DIR}"

      for path in fixture_files:
          text = path.read_text(encoding="utf-8")
          # For a file directly inside _CASE_DIR this label is just its name.
          label = path.relative_to(_CASE_DIR)
          for token in _FORBIDDEN_SUBSTRINGS:
              assert token not in text, f"{label} contains forbidden substring {token!r}"
  def test_sph_corpus_replays_without_error(tmp_path):
      """Load and replay the sph_caihong case, then check the result shape.

      The loaded corpus must expose the seed terms ``["彩虹"]`` in its source
      context. The replay must report status ``"success"`` and its summary must
      contain the four content counters.

      Args:
          tmp_path: base directory (presumably pytest's ``tmp_path`` fixture);
              its ``rt`` child is passed to ``replay_case`` as ``runtime_root``.
      """
      case_name = "sph_caihong"

      loaded = load_corpus(case_name)
      evidence_pack = loaded["source_context.json"]["ext_data"]["evidence_pack"]
      assert evidence_pack["seed_terms"] == ["彩虹"]

      runtime_root = tmp_path / "rt"
      artifacts = replay_case(case_name, runtime_root=runtime_root)
      assert artifacts.state["status"] == "success"

      expected_counters = (
          "pooled_content_count",
          "review_content_count",
          "rejected_content_count",
          "pending_content_count",
      )
      for counter_name in expected_counters:
          assert counter_name in artifacts.summary