| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
- """V3-M0B: shipinhao replay corpus (sph_caihong) shape and scrub checks."""
- from __future__ import annotations
- import json
- from pathlib import Path
- from tests.replay_harness import load_corpus, replay_case
# Input directory of the sph_caihong replay case (relative to the working
# directory the tests are run from).
_CASE_DIR = Path("tests") / "fixtures" / "cases" / "sph_caihong" / "input"

# File names that must be present in the case input directory.
_REQUIRED_FILES = {
    "source_context.json",
    "discovered_content_items.jsonl",
}

# Keys every discovered content item must carry.
_REQUIRED_ITEM_KEYS = {
    # discovery / query linkage
    "content_discovery_id",
    "search_query_id",
    # platform identity of the content
    "platform",
    "platform_content_id",
    "platform_content_format",
    "description",
    # author
    "platform_author_id",
    "author_display_name",
    # engagement and labelling
    "statistics",
    "tags",
}

# Substrings that must not occur anywhere in the text of a fixture file.
# Matching is a plain, case-sensitive substring test.
_FORBIDDEN_SUBSTRINGS = [
    "PASSWORD=",
    "TOKEN=",
    "sec_uid",
    "account_id",
    "cookie",
]
def test_sph_corpus_files_match_real_id45_filenames():
    """Check the sph_caihong input file names against the real_id45 case.

    Every required file must be present, and the sph_caihong input directory
    must not contain a file name that the real_id45 input directory lacks.
    """
    reference_dir = Path("tests/fixtures/cases/real_id45/input")
    reference_names = {entry.name for entry in reference_dir.glob("*")}
    corpus_names = {entry.name for entry in _CASE_DIR.glob("*")}

    assert _REQUIRED_FILES.issubset(corpus_names)
    assert corpus_names.issubset(reference_names)
def test_sph_corpus_items_are_canonical():
    """Check the shape of every record in ``discovered_content_items.jsonl``.

    The file must hold at least four items; each item must carry all of
    ``_REQUIRED_ITEM_KEYS``, have ``platform == "shipinhao"`` and an integer
    ``statistics.digg_count``.

    The file is accepted both as a single JSON document (an array of items)
    and as JSON Lines (one JSON object per non-blank line).
    """
    raw = (_CASE_DIR / "discovered_content_items.jsonl").read_text(encoding="utf-8")
    try:
        # Whole-document parse first: keeps working for a fixture that stores
        # one JSON array under the .jsonl name.
        items = json.loads(raw)
    except json.JSONDecodeError:
        # Genuine JSON Lines: several top-level values make a whole-document
        # parse fail with "Extra data", so decode record by record instead.
        items = [json.loads(line) for line in raw.splitlines() if line.strip()]
    if isinstance(items, dict):
        # A one-record JSON Lines file parses as a bare object; treat it as a
        # single item rather than iterating over its keys.
        items = [items]

    assert isinstance(items, list)
    assert len(items) >= 4
    for item in items:
        assert _REQUIRED_ITEM_KEYS <= set(item)
        assert item["platform"] == "shipinhao"
        assert isinstance(item["statistics"]["digg_count"], int)
def test_sph_corpus_no_forbidden_keys():
    """Scrub check: no fixture file may contain a forbidden substring.

    Each entry directly under ``_CASE_DIR`` is read as UTF-8 text and searched
    for every entry of ``_FORBIDDEN_SUBSTRINGS``; the first hit fails the test
    and names the file and the offending substring.
    """
    for fixture in _CASE_DIR.glob("*"):
        contents = fixture.read_text(encoding="utf-8")
        # First forbidden substring present in this file, or None when clean.
        leaked = next(
            (needle for needle in _FORBIDDEN_SUBSTRINGS if needle in contents),
            None,
        )
        assert leaked is None, f"{fixture.name} contains forbidden substring {leaked!r}"
def test_sph_corpus_replays_without_error(tmp_path):
    """Replay the sph_caihong case and check its status and summary keys.

    First confirms that the loaded corpus has the expected seed terms, then
    replays the case with a runtime root under ``tmp_path`` and asserts that
    the run reports success and that the summary exposes all four content
    counters.
    """
    source_context = load_corpus("sph_caihong")["source_context.json"]
    seed_terms = source_context["ext_data"]["evidence_pack"]["seed_terms"]
    assert seed_terms == ["彩虹"]

    runtime_root = tmp_path / "rt"
    result = replay_case("sph_caihong", runtime_root=runtime_root)
    assert result.state["status"] == "success"

    expected_counters = (
        "pooled_content_count",
        "review_content_count",
        "rejected_content_count",
        "pending_content_count",
    )
    for counter in expected_counters:
        assert counter in result.summary
|