| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- import copy
- import pytest
- from content_agent.business_modules import search_intent
- from content_agent.errors import ContentAgentError
- from content_agent.integrations.query_prompt_config import DEFAULT_PROFILE
- from content_agent.run_service import RunService
- from content_agent.schemas import RunStartRequest
- from tests.p1_helpers import FakeQueryVariantClient, REAL_SOURCE_FIXTURE
# Business terms that must not show up in seed terms or generated queries;
# the fixed-term injection test scans both for every entry in this list.
# The trailing comments are English glosses of the escaped literals.
FORBIDDEN_FIXED_BUSINESS_TERMS: list[str] = [
    "\u8d2a\u8150",  # "corruption"
    "\u57fa\u5c42\u516c\u804c\u4eba\u5458",  # "grassroots public officials"
    "\u6848\u4f8b",  # "case"
    "\u89e3\u8bfb",  # "interpretation"
    "\u8b66\u793a",  # "warning"
]
class _Runtime:
    """In-memory stand-in for the runtime object handed to ``search_intent.run``.

    Only ``append_jsonl`` is provided. Everything written through it is kept
    in ``self.rows`` (filename -> list of rows) so a test can inspect it.
    """

    def __init__(self):
        # filename -> rows accumulated across all append_jsonl calls
        self.rows = {}

    def append_jsonl(self, _run_id, filename, rows):
        """Append ``rows`` to whatever was already recorded for ``filename``.

        Args:
            _run_id: Accepted to match the call signature; not used.
            filename: Key under which the rows are collected.
            rows: Iterable of rows to add.

        The rows are copied into a list owned by this object, so repeated
        calls accumulate instead of overwriting, and later mutation of the
        caller's list does not change what was recorded.
        """
        self.rows.setdefault(filename, []).extend(rows)
def _seed_pack():
    """Return a new hand-written seed pack dict with a single seed term.

    A fresh dict (with fresh nested lists) is built on every call, so callers
    may mutate the result freely.
    """
    post_id = "60219550"

    # Terms and bindings.
    pack = {
        "seed_terms": ["中医养生"],
        "itemset_items": ["补气血"],
        "category_bindings": [{"category_id": "c1"}],
        "element_bindings": [{"element_id": "e1"}],
    }

    # Identifiers of the pattern run and the posts/itemsets it refers to.
    pack.update(
        {
            "pattern_source_system": "pg_pattern_v2",
            "pattern_execution_id": 1987,
            "mining_config_id": 58,
            "source_post_id": post_id,
            "matched_post_ids": [post_id],
            "itemset_ids": [1607977],
        }
    )

    # Numeric statistics.
    pack.update(
        {
            "support": 0.2,
            "absolute_support": 31,
            "confidence": 0.8,
        }
    )
    return pack
def test_search_seed_and_queries_do_not_inject_fixed_business_terms(tmp_path):
    """Seed terms and item/LLM queries of a mock run contain no forbidden term.

    Also pins the exact ids, texts, generation methods and variant links of
    the ``item_single`` / ``llm_variant`` queries produced for the fixture.
    """
    variant_by_seed = {
        "爱国情感": "家国叙事素材",
        "人物故事": "榜样人物素材",
    }
    service = RunService(
        runtime_root=tmp_path / "runtime" / "v1",
        query_variant_client=FakeQueryVariantClient(variant_by_seed),
    )
    request = RunStartRequest(platform_mode="mock", source=str(REAL_SOURCE_FIXTURE))
    run_id = service.start_run(request)["run_id"]

    pattern_seed_pack = service.read_json(run_id, "pattern_seed_pack.json")
    all_queries = service.read_jsonl(run_id, "search_queries.jsonl")
    p2_methods = {"item_single", "llm_variant"}
    p2_queries = [
        entry
        for entry in all_queries
        if entry["search_query_generation_method"] in p2_methods
    ]

    assert pattern_seed_pack["seed_terms"] == ["爱国情感", "人物故事"]

    # One (id, query text, generation method) tuple per expected row, in order.
    expected_rows = [
        ("q_001", "爱国情感", "item_single"),
        ("q_002", "家国叙事素材", "llm_variant"),
        ("q_003", "人物故事", "item_single"),
        ("q_004", "榜样人物素材", "llm_variant"),
    ]
    actual_rows = [
        (
            entry["search_query_id"],
            entry["search_query"],
            entry["search_query_generation_method"],
        )
        for entry in p2_queries
    ]
    assert actual_rows == expected_rows

    # Each LLM variant points back at the item query it was derived from.
    assert p2_queries[1]["llm_variant_of"] == "q_001"
    assert p2_queries[3]["llm_variant_of"] == "q_003"

    checked_values = list(pattern_seed_pack["seed_terms"])
    checked_values.extend(entry["search_query"] for entry in p2_queries)
    for value in checked_values:
        for term in FORBIDDEN_FIXED_BUSINESS_TERMS:
            assert term not in value
def test_search_queries_preserve_source_terms_for_replay(tmp_path):
    """Item/LLM queries record which seed term and source field produced them.

    The same information must be present both on the query row and inside its
    ``raw_payload``; LLM variants additionally carry prompt/model metadata and
    the evidence that was fed to the variant client.
    """
    fake_client = FakeQueryVariantClient(
        {
            "爱国情感": "家国叙事素材",
            "人物故事": "榜样人物素材",
        }
    )
    service = RunService(
        runtime_root=tmp_path / "runtime" / "v1",
        query_variant_client=fake_client,
    )
    state = service.start_run(
        RunStartRequest(platform_mode="mock", source=str(REAL_SOURCE_FIXTURE))
    )

    p2_queries = []
    for row in service.read_jsonl(state["run_id"], "search_queries.jsonl"):
        if row["search_query_generation_method"] in {"item_single", "llm_variant"}:
            p2_queries.append(row)

    # Item query and its LLM variant share the same originating seed term.
    expected_source_terms = [["爱国情感"], ["爱国情感"], ["人物故事"], ["人物故事"]]
    for row, source_terms in zip(p2_queries, expected_source_terms, strict=True):
        assert row["query_source_terms"] == source_terms
        assert row["query_source_fields"] == ["seed_terms"]
        raw_payload = row["raw_payload"]
        assert raw_payload["query_source_terms"] == source_terms
        seed_ref = row["pattern_seed_ref"]
        assert seed_ref["source_field"] == "seed_terms"
        assert seed_ref["seed_term"] == source_terms[0]
        assert raw_payload["pattern_seed_ref"]["seed_term"] == source_terms[0]

    llm_queries = [
        row
        for row in p2_queries
        if row["search_query_generation_method"] == "llm_variant"
    ]
    assert len(llm_queries) == 2
    for row in llm_queries:
        raw_payload = row["raw_payload"]
        assert raw_payload["llm_prompt_version"] == "fake-query-prompt-v1"
        assert raw_payload["llm_generation_model"] == "fake-query-model"
        evidence = raw_payload["llm_input_evidence"]
        assert evidence["source_field"] == "seed_terms"
        assert evidence["itemset_items"]
def test_search_intent_custom_evidence_fields_whitelist():
    """A profile's ``evidence_fields`` limits the keys of the LLM evidence.

    With the whitelist set to ``seed_term`` and ``support``, exactly those
    keys (in that order) appear in ``llm_input_evidence`` on the query row
    and in its ``raw_payload``.
    """
    whitelist = ["seed_term", "support"]
    client = FakeQueryVariantClient({"中医养生": "气血食疗"})
    client.profile = copy.deepcopy(DEFAULT_PROFILE)
    # Hand over a copy so the expected list cannot be mutated by the code under test.
    client.profile["evidence_fields"] = list(whitelist)

    queries = search_intent.run("run_1", "policy_1", _seed_pack(), _Runtime(), client)

    llm_rows = [
        row for row in queries if row["search_query_generation_method"] == "llm_variant"
    ]
    llm_query = llm_rows[0]

    assert list(llm_query["llm_input_evidence"].keys()) == whitelist
    assert list(llm_query["raw_payload"]["llm_input_evidence"].keys()) == whitelist
    assert llm_query["query_source_fields"] == ["seed_terms"]
def test_search_intent_custom_generic_filter_blocks_query():
    """A variant listed in the profile's ``generic_filter`` aborts generation.

    The fake client returns a variant that the custom filter names verbatim,
    so ``search_intent.run`` must raise ``QUERY_GENERATION_FAILED`` with reason
    ``llm_variant_generic``.
    """
    client = FakeQueryVariantClient({"中医养生": "禁用泛词"})
    client.profile = copy.deepcopy(DEFAULT_PROFILE)
    client.profile["generic_filter"] = {"queries": ["禁用泛词"], "tokens": []}

    with pytest.raises(ContentAgentError) as exc_info:
        search_intent.run("run_1", "policy_1", _seed_pack(), _Runtime(), client)

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never execute.
    error = exc_info.value
    assert error.error_code == "QUERY_GENERATION_FAILED"
    assert error.detail["reason"] == "llm_variant_generic"
def test_search_intent_rejects_unsupported_variants_per_seed():
    """A profile asking for two variants per seed is rejected.

    ``search_intent.run`` must raise ``QUERY_GENERATION_FAILED`` whose detail
    names the reason and echoes the offending ``variants_per_seed`` value.
    """
    client = FakeQueryVariantClient({"中医养生": "气血食疗"})
    client.profile = copy.deepcopy(DEFAULT_PROFILE)
    client.profile["variants_per_seed"] = 2

    with pytest.raises(ContentAgentError) as exc_info:
        search_intent.run("run_1", "policy_1", _seed_pack(), _Runtime(), client)

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never execute.
    error = exc_info.value
    expected_detail = {"reason": "variants_per_seed_unsupported", "variants_per_seed": 2}
    assert error.error_code == "QUERY_GENERATION_FAILED"
    assert error.detail == expected_detail
|