|
|
@@ -1,7 +1,7 @@
|
|
|
"""
|
|
|
Stage 2: 将 apply_to_draft 映射为正式 apply_to
|
|
|
|
|
|
-从 case.json 读取,只对每个 case 的 capability 中的 apply_to_draft 做映射。
|
|
|
+从 case.json 读取,只对每个 case 的 workflow_groups[*].capability 中的 apply_to_draft 做映射。
|
|
|
调用 LLM 映射到内容树的正式节点,原位回填到 case.json
|
|
|
|
|
|
改造版本:通过远程 API 获取内容树,不再依赖本地文件
|
|
|
@@ -258,118 +258,131 @@ async def ground_single_case(
|
|
|
compact_tree: str = None,
|
|
|
) -> tuple[Dict[str, Any], float]:
|
|
|
"""
|
|
|
- 对单个 case 的 capability[*].apply_to_draft 做 apply_to 映射。
|
|
|
-
|
|
|
- 只处理 capability。
|
|
|
+ 对单个 case 的 workflow_groups[*].capability[*].apply_to_draft 做 apply_to 映射。
|
|
|
"""
|
|
|
total_cost = 0.0
|
|
|
result = dict(case_item)
|
|
|
title = case_item.get("title", "")[:20] or "untitled"
|
|
|
|
|
|
- capability_items = case_item.get("capability")
|
|
|
- if not isinstance(capability_items, list) or not capability_items:
|
|
|
+ workflow_groups = case_item.get("workflow_groups")
|
|
|
+ if not isinstance(workflow_groups, list) or not workflow_groups:
|
|
|
return result, total_cost
|
|
|
|
|
|
- draft_capability_pairs = [
|
|
|
- (idx, capability)
|
|
|
- for idx, capability in enumerate(capability_items)
|
|
|
- if isinstance(capability, dict) and "apply_to_draft" in capability
|
|
|
+ updated_groups = [
|
|
|
+ dict(group) if isinstance(group, dict) else group
|
|
|
+ for group in workflow_groups
|
|
|
]
|
|
|
- if not draft_capability_pairs:
|
|
|
- return result, total_cost
|
|
|
|
|
|
- # 收集 capability 的关键词(用于 API 搜索)
|
|
|
- if use_api:
|
|
|
- all_keywords = []
|
|
|
- for _, capability in draft_capability_pairs:
|
|
|
- apply_to_draft = capability.get("apply_to_draft", {})
|
|
|
- for key in ["实质", "形式"]:
|
|
|
- for draft_text in apply_to_draft.get(key, []):
|
|
|
- all_keywords.extend(extract_keywords_from_draft(draft_text))
|
|
|
- all_keywords = list(dict.fromkeys(all_keywords))[:10]
|
|
|
-
|
|
|
- if all_keywords:
|
|
|
- categories = await search_categories_by_keywords(all_keywords, top_k=5)
|
|
|
- capability_compact_tree = build_compact_tree(categories)
|
|
|
- capability_ref_paths = list(dict.fromkeys(
|
|
|
- c["path"] for c in categories if c.get("path")
|
|
|
- ))
|
|
|
+ for group_idx, group in enumerate(updated_groups):
|
|
|
+ if not isinstance(group, dict):
|
|
|
+ continue
|
|
|
+ workflow_id = group.get("workflow_id") or f"g{group_idx + 1}"
|
|
|
+ capability_items = group.get("capability")
|
|
|
+ if not isinstance(capability_items, list) or not capability_items:
|
|
|
+ continue
|
|
|
+
|
|
|
+ draft_capability_pairs = [
|
|
|
+ (idx, capability)
|
|
|
+ for idx, capability in enumerate(capability_items)
|
|
|
+ if isinstance(capability, dict) and "apply_to_draft" in capability
|
|
|
+ ]
|
|
|
+ if not draft_capability_pairs:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 收集 capability 的关键词(用于 API 搜索)
|
|
|
+ if use_api:
|
|
|
+ all_keywords = []
|
|
|
+ for _, capability in draft_capability_pairs:
|
|
|
+ apply_to_draft = capability.get("apply_to_draft", {})
|
|
|
+ for key in ["实质", "形式"]:
|
|
|
+ for draft_text in apply_to_draft.get(key, []):
|
|
|
+ all_keywords.extend(extract_keywords_from_draft(draft_text))
|
|
|
+ all_keywords = list(dict.fromkeys(all_keywords))[:10]
|
|
|
+
|
|
|
+ if all_keywords:
|
|
|
+ categories = await search_categories_by_keywords(all_keywords, top_k=5)
|
|
|
+ capability_compact_tree = build_compact_tree(categories)
|
|
|
+ capability_ref_paths = list(dict.fromkeys(
|
|
|
+ c["path"] for c in categories if c.get("path")
|
|
|
+ ))
|
|
|
+ else:
|
|
|
+ capability_compact_tree = compact_tree or "[]"
|
|
|
+ capability_ref_paths = []
|
|
|
else:
|
|
|
capability_compact_tree = compact_tree or "[]"
|
|
|
capability_ref_paths = []
|
|
|
- else:
|
|
|
- capability_compact_tree = compact_tree or "[]"
|
|
|
- capability_ref_paths = []
|
|
|
|
|
|
- updated_capabilities = [
|
|
|
- dict(capability) if isinstance(capability, dict) else capability
|
|
|
- for capability in capability_items
|
|
|
- ]
|
|
|
- id_to_index = {
|
|
|
- capability.get("capability_id"): idx
|
|
|
- for idx, capability in draft_capability_pairs
|
|
|
- if isinstance(capability.get("capability_id"), str)
|
|
|
- }
|
|
|
-
|
|
|
- batches = iter_batches(draft_capability_pairs, CAPABILITY_GROUNDING_BATCH_SIZE)
|
|
|
- for batch_idx, batch_pairs in enumerate(batches, start=1):
|
|
|
- draft_capabilities = [
|
|
|
- build_capability_grounding_input(capability)
|
|
|
- for _, capability in batch_pairs
|
|
|
+ updated_capabilities = [
|
|
|
+ dict(capability) if isinstance(capability, dict) else capability
|
|
|
+ for capability in capability_items
|
|
|
]
|
|
|
- draft = {"capability": draft_capabilities}
|
|
|
- prompt = render_grounding_prompt(template, "capability", draft, capability_compact_tree, capability_ref_paths)
|
|
|
- messages = [{"role": "user", "content": prompt}]
|
|
|
-
|
|
|
- grounded, cost = await call_llm_with_retry(
|
|
|
- llm_call=llm_call,
|
|
|
- messages=messages,
|
|
|
- model=model,
|
|
|
- temperature=0.1,
|
|
|
- max_tokens=4000,
|
|
|
- max_retries=3,
|
|
|
- schema_name="apply_to_grounding_capability",
|
|
|
- task_name=f"Ground_C_{title}_B{batch_idx}/{len(batches)}",
|
|
|
- )
|
|
|
- total_cost += cost
|
|
|
+ id_to_index = {
|
|
|
+ capability.get("capability_id"): idx
|
|
|
+ for idx, capability in draft_capability_pairs
|
|
|
+ if isinstance(capability.get("capability_id"), str)
|
|
|
+ }
|
|
|
|
|
|
- if not grounded or not isinstance(grounded.get("capability"), list):
|
|
|
- continue
|
|
|
+ batches = iter_batches(draft_capability_pairs, CAPABILITY_GROUNDING_BATCH_SIZE)
|
|
|
+ for batch_idx, batch_pairs in enumerate(batches, start=1):
|
|
|
+ draft_capabilities = [
|
|
|
+ build_capability_grounding_input(capability)
|
|
|
+ for _, capability in batch_pairs
|
|
|
+ ]
|
|
|
+ draft = {"capability": draft_capabilities}
|
|
|
+ prompt = render_grounding_prompt(template, "capability", draft, capability_compact_tree, capability_ref_paths)
|
|
|
+ messages = [{"role": "user", "content": prompt}]
|
|
|
+
|
|
|
+ grounded, cost = await call_llm_with_retry(
|
|
|
+ llm_call=llm_call,
|
|
|
+ messages=messages,
|
|
|
+ model=model,
|
|
|
+ temperature=0.1,
|
|
|
+ max_tokens=4000,
|
|
|
+ max_retries=3,
|
|
|
+ schema_name="apply_to_grounding_capability",
|
|
|
+ task_name=f"Ground_C_{title}_{workflow_id}_B{batch_idx}/{len(batches)}",
|
|
|
+ )
|
|
|
+ total_cost += cost
|
|
|
|
|
|
- grounded_capabilities = grounded["capability"]
|
|
|
- used_indices = set()
|
|
|
- for output_idx, grounded_capability in enumerate(grounded_capabilities):
|
|
|
- if not isinstance(grounded_capability, dict):
|
|
|
- continue
|
|
|
- capability_idx = None
|
|
|
- capability_id = grounded_capability.get("capability_id")
|
|
|
- if isinstance(capability_id, str):
|
|
|
- capability_idx = id_to_index.get(capability_id)
|
|
|
- if capability_idx is None and output_idx < len(batch_pairs):
|
|
|
- capability_idx = batch_pairs[output_idx][0]
|
|
|
- if capability_idx is None or capability_idx in used_indices:
|
|
|
+ if not grounded or not isinstance(grounded.get("capability"), list):
|
|
|
continue
|
|
|
- apply_to = grounded_capability.get("apply_to")
|
|
|
- suggest_apply_to = grounded_capability.get("suggest_apply_to")
|
|
|
- body = updated_capabilities[capability_idx].get("body", "")
|
|
|
- if (
|
|
|
- apply_to is not None
|
|
|
- and isinstance(suggest_apply_to, str)
|
|
|
- and suggest_apply_to.strip()
|
|
|
- and isinstance(updated_capabilities[capability_idx], dict)
|
|
|
- and apply_to_body_excerpts_are_verbatim(apply_to, body)
|
|
|
- ):
|
|
|
- updated_capabilities[capability_idx]["apply_to"] = apply_to
|
|
|
- updated_capabilities[capability_idx]["suggest_apply_to"] = suggest_apply_to.strip()
|
|
|
- updated_capabilities[capability_idx].pop("apply_to_draft", None)
|
|
|
- used_indices.add(capability_idx)
|
|
|
- else:
|
|
|
- print(
|
|
|
- f" ⚠️ Skip capability grounding writeback: "
|
|
|
- f"{capability_id or capability_idx} has missing/non-verbatim body_excerpt"
|
|
|
- )
|
|
|
|
|
|
- result["capability"] = updated_capabilities
|
|
|
+ grounded_capabilities = grounded["capability"]
|
|
|
+ used_indices = set()
|
|
|
+ for output_idx, grounded_capability in enumerate(grounded_capabilities):
|
|
|
+ if not isinstance(grounded_capability, dict):
|
|
|
+ continue
|
|
|
+ capability_idx = None
|
|
|
+ capability_id = grounded_capability.get("capability_id")
|
|
|
+ if isinstance(capability_id, str):
|
|
|
+ capability_idx = id_to_index.get(capability_id)
|
|
|
+ if capability_idx is None and output_idx < len(batch_pairs):
|
|
|
+ capability_idx = batch_pairs[output_idx][0]
|
|
|
+ if capability_idx is None or capability_idx in used_indices:
|
|
|
+ continue
|
|
|
+ apply_to = grounded_capability.get("apply_to")
|
|
|
+ suggest_apply_to = grounded_capability.get("suggest_apply_to")
|
|
|
+ body = updated_capabilities[capability_idx].get("body", "")
|
|
|
+ if (
|
|
|
+ apply_to is not None
|
|
|
+ and isinstance(suggest_apply_to, str)
|
|
|
+ and suggest_apply_to.strip()
|
|
|
+ and isinstance(updated_capabilities[capability_idx], dict)
|
|
|
+ and apply_to_body_excerpts_are_verbatim(apply_to, body)
|
|
|
+ ):
|
|
|
+ updated_capabilities[capability_idx]["apply_to"] = apply_to
|
|
|
+ updated_capabilities[capability_idx]["suggest_apply_to"] = suggest_apply_to.strip()
|
|
|
+ updated_capabilities[capability_idx].pop("apply_to_draft", None)
|
|
|
+ used_indices.add(capability_idx)
|
|
|
+ else:
|
|
|
+ print(
|
|
|
+ f" ⚠️ Skip capability grounding writeback: "
|
|
|
+ f"{capability_id or capability_idx} has missing/non-verbatim body_excerpt"
|
|
|
+ )
|
|
|
+
|
|
|
+ group["capability"] = updated_capabilities
|
|
|
+
|
|
|
+ result["workflow_groups"] = updated_groups
|
|
|
|
|
|
return result, total_cost
|
|
|
|
|
|
@@ -411,12 +424,18 @@ async def apply_grounding(
|
|
|
# 加载 prompt 模板
|
|
|
template = load_prompt_template("apply_to_grounding")
|
|
|
|
|
|
- # 过滤出需要处理的 case(只看 capability[*].apply_to_draft)
|
|
|
+ # 过滤出需要处理的 case(只看 workflow_groups[*].capability[*].apply_to_draft)
|
|
|
needs_grounding = []
|
|
|
for case in cases:
|
|
|
- capability_items = case.get("capability")
|
|
|
- has_capability_draft = isinstance(capability_items, list) and any(
|
|
|
- isinstance(capability, dict) and "apply_to_draft" in capability for capability in capability_items
|
|
|
+ workflow_groups = case.get("workflow_groups")
|
|
|
+ has_capability_draft = isinstance(workflow_groups, list) and any(
|
|
|
+ isinstance(group, dict)
|
|
|
+ and isinstance(group.get("capability"), list)
|
|
|
+ and any(
|
|
|
+ isinstance(capability, dict) and "apply_to_draft" in capability
|
|
|
+ for capability in group.get("capability", [])
|
|
|
+ )
|
|
|
+ for group in workflow_groups
|
|
|
)
|
|
|
if has_capability_draft:
|
|
|
needs_grounding.append(case)
|