"""V3-M0B: shipinhao replay corpus (sph_caihong) shape and scrub checks.""" from __future__ import annotations import json from pathlib import Path from tests.replay_harness import load_corpus, replay_case _CASE_DIR = Path("tests/fixtures/cases/sph_caihong/input") _REQUIRED_FILES = {"source_context.json", "discovered_content_items.jsonl"} _REQUIRED_ITEM_KEYS = { "content_discovery_id", "search_query_id", "platform", "platform_content_id", "platform_content_format", "description", "platform_author_id", "author_display_name", "statistics", "tags", } _FORBIDDEN_SUBSTRINGS = ["PASSWORD=", "TOKEN=", "sec_uid", "account_id", "cookie"] def test_sph_corpus_files_match_real_id45_filenames(): real_files = {p.name for p in Path("tests/fixtures/cases/real_id45/input").glob("*")} sph_files = {p.name for p in _CASE_DIR.glob("*")} assert _REQUIRED_FILES <= sph_files assert sph_files <= real_files def test_sph_corpus_items_are_canonical(): items = json.loads((_CASE_DIR / "discovered_content_items.jsonl").read_text(encoding="utf-8")) assert len(items) >= 4 for item in items: assert _REQUIRED_ITEM_KEYS <= set(item) assert item["platform"] == "shipinhao" assert isinstance(item["statistics"]["digg_count"], int) def test_sph_corpus_no_forbidden_keys(): for path in _CASE_DIR.glob("*"): text = path.read_text(encoding="utf-8") for token in _FORBIDDEN_SUBSTRINGS: assert token not in text, f"{path.name} contains forbidden substring {token!r}" def test_sph_corpus_replays_without_error(tmp_path): corpus = load_corpus("sph_caihong") assert corpus["source_context.json"]["ext_data"]["evidence_pack"]["seed_terms"] == ["彩虹"] artifacts = replay_case("sph_caihong", runtime_root=tmp_path / "rt") assert artifacts.state["status"] == "success" for key in ( "pooled_content_count", "review_content_count", "rejected_content_count", "pending_content_count", ): assert key in artifacts.summary