elksmmx пре 6 дана
родитељ
комит
c7b6337463
1 измењених фајлова са 125 додато и 36 уклоњено
  1. 125 36
      examples/process_pipeline/script/apply_to_grounding.py

+ 125 - 36
examples/process_pipeline/script/apply_to_grounding.py

@@ -1,8 +1,9 @@
 """
 Stage 2: 将 apply_to_draft 映射为正式 apply_to
 
-从 case.json 读取,对每个 case 的 workflow 和 capabilities 中的 apply_to_draft,
-调用 LLM 映射到内容树的正式节点,按 index 原位回填到 case.json
+从 case.json 读取,优先对每个 case 的 fragments 中的 apply_to_draft 做映射;
+没有 fragments 时,回退处理 workflow steps / capabilities 中的 apply_to_draft。
+调用 LLM 映射到内容树的正式节点,原位回填到 case.json
 
 改造版本:通过远程 API 获取内容树,不再依赖本地文件
 """
@@ -226,10 +227,12 @@ async def ground_single_case(
     compact_tree: str = None,
 ) -> tuple[Dict[str, Any], float]:
     """
-    对单个 case 的 workflow 和 capabilities 做 apply_to 映射
+    对单个 case 做 apply_to 映射
 
-    对于 workflow:一次性处理整个 workflow,为每个 step 生成对应的 apply_to
-    对于 capabilities:对每个有 apply_to_draft 的 capability 进行映射
+    优先级:
+    1. 如果存在 fragments,只处理 fragments[*].apply_to_draft,并回填到 fragments[*].apply_to
+    2. 没有 fragments 时,处理旧格式 workflow.steps[*].apply_to_draft
+    3. workflow 没有 draft 时,再处理 capabilities[*].apply_to_draft
 
     Args:
         case_item: 案例数据
@@ -243,25 +246,26 @@ async def ground_single_case(
     result = dict(case_item)
     title = case_item.get("title", "")[:20] or "untitled"
 
-    # 处理 fragments - 整体处理,保持上下文。只要存在 fragments,就不再读取 capabilities。
+    # 处理 fragments - 整体处理,保持上下文。只要存在 fragments,就不再读取 workflow/capabilities。
     fragments = case_item.get("fragments")
     has_fragments = isinstance(fragments, list) and bool(fragments)
     if has_fragments:
-        has_draft = any(
-            isinstance(frag, dict) and "apply_to_draft" in frag
-            for frag in fragments
-        )
+        draft_fragment_pairs = [
+            (idx, frag)
+            for idx, frag in enumerate(fragments)
+            if isinstance(frag, dict) and "apply_to_draft" in frag
+        ]
+        has_draft = bool(draft_fragment_pairs)
 
         if has_draft:
             # 收集所有 fragment 的关键词(用于 API 搜索)
             if use_api:
                 all_keywords = []
-                for frag in fragments:
-                    if isinstance(frag, dict) and "apply_to_draft" in frag:
-                        apply_to_draft = frag.get("apply_to_draft", {})
-                        for key in ["实质", "形式"]:
-                            for draft_text in apply_to_draft.get(key, []):
-                                all_keywords.extend(extract_keywords_from_draft(draft_text))
+                for _, frag in draft_fragment_pairs:
+                    apply_to_draft = frag.get("apply_to_draft", {})
+                    for key in ["实质", "形式"]:
+                        for draft_text in apply_to_draft.get(key, []):
+                            all_keywords.extend(extract_keywords_from_draft(draft_text))
                 all_keywords = list(dict.fromkeys(all_keywords))[:10]
 
                 if all_keywords:
@@ -278,7 +282,9 @@ async def ground_single_case(
                 frag_ref_paths = []
 
             # 复用 capability grounding 的 prompt/schema,只把数据源从 workflow step 换成 fragment。
-            draft = {"capabilities": fragments}
+            # 只发送带 apply_to_draft 的 fragment,再按原始下标回填,避免数组错位。
+            draft_fragments = [frag for _, frag in draft_fragment_pairs]
+            draft = {"capabilities": draft_fragments}
             prompt = render_grounding_prompt(template, "capability", draft, frag_compact_tree, frag_ref_paths)
             messages = [{"role": "user", "content": prompt}]
 
@@ -297,20 +303,98 @@ async def ground_single_case(
             # 按索引回填 apply_to。输入数组来自 fragments,输出数组使用 capability schema。
             if grounded and isinstance(grounded.get("capabilities"), list):
                 grounded_frags = grounded["capabilities"]
-                updated_fragments = []
-                for idx, frag in enumerate(fragments):
-                    updated_frag = dict(frag)
-                    if idx < len(grounded_frags) and isinstance(grounded_frags[idx], dict):
-                        apply_to = grounded_frags[idx].get("apply_to")
+                updated_fragments = [
+                    dict(frag) if isinstance(frag, dict) else frag
+                    for frag in fragments
+                ]
+                for draft_idx, (frag_idx, _) in enumerate(draft_fragment_pairs):
+                    if draft_idx < len(grounded_frags) and isinstance(grounded_frags[draft_idx], dict):
+                        apply_to = grounded_frags[draft_idx].get("apply_to")
                         if apply_to is not None:
-                            updated_frag["apply_to"] = apply_to
-                    updated_frag.pop("apply_to_draft", None)
-                    updated_fragments.append(updated_frag)
+                            updated_fragments[frag_idx]["apply_to"] = apply_to
+                            updated_fragments[frag_idx].pop("apply_to_draft", None)
                 result["fragments"] = updated_fragments
 
-    # 没有 fragments 时,才回退处理 capabilities。
+    # 没有 fragments 时,回退处理旧格式 workflow step draft。
+    workflow = case_item.get("workflow")
+    handled_workflow = False
+    if not has_fragments and isinstance(workflow, dict) and "steps" in workflow:
+        steps = workflow.get("steps", [])
+
+        has_draft = any(
+            isinstance(step, dict) and "apply_to_draft" in step
+            for step in steps
+        )
+
+        if has_draft:
+            handled_workflow = True
+            # 收集所有 step 的关键词(用于 API 搜索)
+            if use_api:
+                all_keywords = []
+                for step in steps:
+                    if isinstance(step, dict) and "apply_to_draft" in step:
+                        apply_to_draft = step.get("apply_to_draft", {})
+                        for key in ["实质", "形式"]:
+                            for draft_text in apply_to_draft.get(key, []):
+                                all_keywords.extend(extract_keywords_from_draft(draft_text))
+                all_keywords = list(dict.fromkeys(all_keywords))[:10]
+
+                if all_keywords:
+                    categories = await search_categories_by_keywords(all_keywords, top_k=5)
+                    workflow_compact_tree = build_compact_tree(categories)
+                    workflow_ref_paths = list(dict.fromkeys(
+                        c["path"] for c in categories if c.get("path")
+                    ))
+                else:
+                    workflow_compact_tree = compact_tree or "[]"
+                    workflow_ref_paths = []
+            else:
+                workflow_compact_tree = compact_tree or "[]"
+                workflow_ref_paths = []
+
+            # 整个 workflow 传给 LLM(保持上下文)
+            draft = {"strategy": workflow}
+            prompt = render_grounding_prompt(template, "strategy", draft, workflow_compact_tree, workflow_ref_paths)
+            messages = [{"role": "user", "content": prompt}]
+
+            grounded, cost = await call_llm_with_retry(
+                llm_call=llm_call,
+                messages=messages,
+                model=model,
+                temperature=0.1,
+                max_tokens=4000,
+                max_retries=3,
+                schema_name="apply_to_grounding_strategy",
+                task_name=f"Ground_W_{title}",
+            )
+            total_cost += cost
+
+            # 按 order 回填 apply_to
+            if grounded and isinstance(grounded.get("strategy"), dict):
+                grounded_steps = grounded["strategy"].get("steps", [])
+                order_to_apply_to = {}
+                for grounded_step in grounded_steps:
+                    if isinstance(grounded_step, dict):
+                        order = grounded_step.get("order")
+                        apply_to = grounded_step.get("apply_to")
+                        if order is not None and apply_to is not None:
+                            order_to_apply_to[order] = apply_to
+
+                updated_steps = []
+                for step in steps:
+                    updated_step = dict(step)
+                    order = step.get("order")
+                    if order in order_to_apply_to:
+                        updated_step["apply_to"] = order_to_apply_to[order]
+                        updated_step.pop("apply_to_draft", None)
+                    updated_steps.append(updated_step)
+
+                result["workflow"] = dict(workflow)
+                result["workflow"]["steps"] = updated_steps
+
+    # 没有 fragments 且 workflow 没处理时,才回退处理 capabilities。
     capabilities = case_item.get("capabilities")
-    if not has_fragments and isinstance(capabilities, list) and capabilities:
+    if not has_fragments and not handled_workflow and isinstance(capabilities, list) and capabilities:
         has_draft = any(
             isinstance(cap, dict) and "apply_to_draft" in cap
             for cap in capabilities
@@ -418,15 +502,20 @@ async def apply_grounding(
     needs_grounding = []
     for case in cases:
         fragments = case.get("fragments")
+        workflow = case.get("workflow")
         capabilities = case.get("capabilities")
         has_fragments = isinstance(fragments, list) and bool(fragments)
         has_frag_draft = isinstance(fragments, list) and any(
             isinstance(frag, dict) and "apply_to_draft" in frag for frag in fragments
         )
-        has_cap_draft = not has_fragments and isinstance(capabilities, list) and any(
+        has_workflow_draft = not has_fragments and isinstance(workflow, dict) and any(
+            isinstance(step, dict) and "apply_to_draft" in step
+            for step in workflow.get("steps", [])
+        )
+        has_cap_draft = not has_fragments and not has_workflow_draft and isinstance(capabilities, list) and any(
             isinstance(c, dict) and "apply_to_draft" in c for c in capabilities
         )
-        if has_frag_draft or has_cap_draft:
+        if has_frag_draft or has_workflow_draft or has_cap_draft:
             needs_grounding.append(case)
 
     print(f"Grounding apply_to for {len(needs_grounding)}/{len(cases)} cases...")
@@ -455,23 +544,23 @@ async def apply_grounding(
             )
 
             print(f"  <- [{index}] [{case_id}] grounded (cost=${cost:.4f})")
-            return grounded, cost
+            return case_item, grounded, cost
 
     tasks = [process_with_semaphore(case) for case in needs_grounding]
     results_with_costs = await asyncio.gather(*tasks)
 
-    # 用 grounded 结果替换原 case(按 index 匹配
+    # 用 grounded 结果替换原 case(按对象身份匹配,避免 index 缺失或重复时回填错 case)
     grounded_map = {}
     total_cost = 0.0
-    for grounded, cost in results_with_costs:
-        grounded_map[grounded.get("index")] = grounded
+    for original_case, grounded, cost in results_with_costs:
+        grounded_map[id(original_case)] = grounded
         total_cost += cost
 
     updated_cases = []
     for case in cases:
-        idx = case.get("index")
-        if idx in grounded_map:
-            updated_cases.append(grounded_map[idx])
+        case_id = id(case)
+        if case_id in grounded_map:
+            updated_cases.append(grounded_map[case_id])
         else:
             updated_cases.append(case)