liuzhiheng před 1 měsícem
rodič
revize
f3fc138127

+ 7 - 9
examples_how/overall_derivation/derivation_main.md

@@ -224,7 +224,7 @@ agent(agent_type="derivation_search", task="执行搜索任务,account_name=xx
         "derived_nodes": []
       },
       "output": ["分享"],
-      "reason": "意图维度中,'分享'节点是全局常量(c=true)且整体概率 r=0.913,极高,是账号最核心的创作意图起点。",
+      "reason": "'分享'节点是全局常量(c=true)且整体概率 r=0.913,极高,是账号最核心的创作意图起点。",
       "tools": []
     },
     {
@@ -351,9 +351,9 @@ agent(agent_type="derivation_search", task="执行搜索任务,account_name=xx
 - **适用场景**:通过人设树条件概率关联推导相关节点;非首轮进行内部推导时可以使用。
 - **操作方式**:调用工具 `find_tree_nodes_by_conditional_ratio(account_name, post_id, derived_items, conditional_ratio_threshold, top_n, round, log_id)`。`log_id` 为当前推导日志ID;`round` 为当前推导轮次。`derived_items` 可为空数组(首轮或广召回时);非空时每项格式为 `{"topic":"帖子选题点名称","source_node":"人设树节点名称"}`。工具返回格式示例:
   ```
-  - 分享  条件概率=1.0  父节点=分享  所属维度=分享  帖子选题点匹配=分享(1.0)
-  - 趣味道具  条件概率=0.125  父节点=家居用品  所属维度=物品  帖子选题点匹配=夸张道具(0.7831)
-  - 第一人称视角  条件概率=1.0  父节点=体验式呈现  所属维度=故事编排  帖子选题点匹配=无
+  - 分享  条件概率=1.0  所属维度=分享  帖子选题点匹配=分享(1.0)
+  - 趣味道具  条件概率=0.125  所属维度=物品  帖子选题点匹配=夸张道具(0.7831)
+  - 第一人称视角  条件概率=1.0  所属维度=故事编排  帖子选题点匹配=无
   ```
   - **推导路径的 `output`**:填写工具返回的**人设树节点名称**(如 `趣味道具`)。
   - **匹配判断**:读取「帖子选题点匹配」字段——若有值(如 `夸张道具(0.7831)`),则 `is_matched=true`,评估日志中 `matched_post_point` 填写括号前的帖子选题点名称(如 `夸张道具`),`matched_score` 填写匹配分数数值(如 `0.7831`),`matched_reason` 填写匹配分数描述(如 `匹配分数=0.7831`);若字段值为「无」,则 `is_matched=false`。
@@ -371,14 +371,12 @@ agent(agent_type="derivation_search", task="执行搜索任务,account_name=xx
                 "趣味道具"
             ],
             "patterns": [],
-            "derived_nodes": [
-                "家居用品"
-            ]
+            "derived_nodes": []
         },
         "output": [
             "趣味道具"
         ],
-        "reason": "根据已推导出的'家居用品'及维度'物品',人设树中'趣味道具'节点(父节点=家居用品)的条件概率=0.125,工具返回该节点存在帖子选题点匹配,因此将其作为推导候选。",
+        "reason": "在已推导出的维度'物品'下,'趣味道具'节点条件概率=0.125,工具返回该节点存在帖子选题点匹配,因此将其作为推导候选。",
         "tools": []
     },
     {
@@ -393,7 +391,7 @@ agent(agent_type="derivation_search", task="执行搜索任务,account_name=xx
         "output": [
             "第一人称视角"
         ],
-        "reason": "根据已推导出维度'故事编排',人设树中'第一人称视角'节点的条件概率=1.0且属于'故事编排'维度下的孩子节点,因此将其作为推导候选。",
+        "reason": "在已推导出的维度'故事编排'下,'第一人称视角'节点的条件概率=1.0,因此将其作为推导候选。",
         "tools": []
     }
 ]  

+ 84 - 28
examples_how/overall_derivation/generate_visualize_data.py

@@ -146,7 +146,8 @@ def build_derivation_result(
     result = []
     derived_names_so_far: set[str] = set()
     fully_derived_names_so_far: set[str] = set()  # 已出现过 is_fully_derived=true 的选题点
-    best_score_by_name: dict[str, tuple[float, bool]] = {}  # name -> (matched_score, is_fully_derived),遇 is_fully=true 时更新
+    # name -> (matched_score, is_fully_derived),一旦 is_fully_derived=True,后续轮次不再更新 matched_score
+    best_score_by_name: dict[str, tuple[float, bool]] = {}
 
     for i, (derivation, eval_data) in enumerate(zip(derivations, evals)):
         round_num = derivation.get("round", i + 1)
@@ -175,7 +176,10 @@ def build_derivation_result(
             elif name not in fully_derived_names_so_far and is_fully:
                 new_derived_names.add(name)
 
-        # 更新推导集合与 best:首次出现或本轮 is_fully=true 时更新 best
+        # 更新推导集合与 best:
+        # - 首次出现时写入
+        # - 若尚未 fully 且本轮 fully,则更新为 fully,并锁定,不再被后续轮次覆盖
+        # - 若尚未 fully 且本轮仍为部分推导,可用更高分数更新
         derived_names_so_far |= matched_post_points
         for name in matched_post_points:
             val = this_round_scores.get(name)
@@ -184,8 +188,18 @@ def build_derivation_result(
             score, is_fully = val
             if name not in best_score_by_name:
                 best_score_by_name[name] = (score, is_fully)
-            elif is_fully:
-                best_score_by_name[name] = (score, is_fully)
+            else:
+                prev_score, prev_fully = best_score_by_name[name]
+                # 已经 fully 的节点,后续轮次不再更新 matched_score
+                if prev_fully:
+                    pass
+                else:
+                    if is_fully:
+                        best_score_by_name[name] = (score, True)
+                    else:
+                        # 都是部分推导时,可以用更高分覆盖
+                        if score > prev_score:
+                            best_score_by_name[name] = (score, False)
             if is_fully:
                 fully_derived_names_so_far.add(name)
 
@@ -320,9 +334,6 @@ def build_visualize_edges(
                 if k not in match_by_round_output:
                     match_by_round_output[k] = val
 
-    # 按 (round_num, mp) 收集节点候选,同轮同节点保留 matched_score 最高的一条
-    node_candidates: dict[tuple[int, str], dict] = {}  # (round_num, mp) -> node_dict (含 score, is_fully_derived)
-
     def get_match(round_num: int, path_id: int | None, item_id: int | None, out_item: str) -> tuple[str, str, float, bool] | None:
         if path_id is not None and item_id is not None:
             v = match_by_path_item.get((round_num, path_id, item_id))
@@ -330,13 +341,41 @@ def build_visualize_edges(
                 return v
         return match_by_round_output.get((round_num, out_item))
 
+    # 第一遍:按 (round_num, mp) 聚合节点最佳信息(不考虑边是否最终保留)
+    # (round_num, mp) -> (score, is_fully_derived, derivation_output_point, method)
+    best_node_info_by_round_mp: dict[tuple[int, str], tuple[float, bool, str, str]] = {}
+    for round_idx, derivation in enumerate(derivations):
+        round_num = derivation.get("round", round_idx + 1)
+        for dr in derivation.get("derivation_results") or []:
+            output_list = dr.get("output") or []
+            path_id = dr.get("id")
+            for i, out_item in enumerate(output_list):
+                item_id = i + 1
+                v = get_match(round_num, path_id, item_id, out_item)
+                if not v:
+                    continue
+                mp, _reason, score, is_fully = v
+                key = (round_num, mp)
+                prev = best_node_info_by_round_mp.get(key)
+                if prev is None or score > prev[0]:
+                    best_node_info_by_round_mp[key] = (score, bool(is_fully), out_item, dr.get("method", ""))
+
     edge_list = []
     round_output_seen: set[tuple[int, str]] = set()  # (round_num, node_name) 本轮已作为某边的 output
     best_score_by_node: dict[str, float] = {}  # node_name -> 已出现过的最高 matched_score
     fully_derived_nodes: set[str] = set()
+    current_round: int | None = None
 
     for round_idx, derivation in enumerate(derivations):
         round_num = derivation.get("round", round_idx + 1)
+        if current_round is None:
+            current_round = round_num
+        elif round_num != current_round:
+            # 一轮结束后,将本轮 is_fully_derived=true 的节点加入 fully_derived_nodes,用于后续轮次过滤
+            for (rn, name), (score, is_fully, _out_item, _method) in best_node_info_by_round_mp.items():
+                if rn == current_round and is_fully:
+                    fully_derived_nodes.add(name)
+            current_round = round_num
         for dr in derivation.get("derivation_results") or []:
             output_list = dr.get("output") or []
             path_id = dr.get("id")
@@ -352,7 +391,8 @@ def build_visualize_edges(
             if not matched:
                 continue
 
-            # 同一轮内 output 节点不重复;若前面轮次该节点匹配分更高则本轮不保留
+            # 同一轮内 output 节点不重复;若前面轮次该节点匹配分更高则本轮不保留;
+            # 并且只保留与 node_list 中该轮该节点的最高分记录一致的边
             output_names_this_edge = []
             for mp, reason, score, is_fully, out_item in matched:
                 if (round_num, mp) in round_output_seen:
@@ -361,6 +401,9 @@ def build_visualize_edges(
                     continue
                 if score <= best_score_by_node.get(mp, -1.0):
                     continue
+                best_info = best_node_info_by_round_mp.get((round_num, mp))
+                if not best_info or score < best_info[0]:
+                    continue
                 output_names_this_edge.append((mp, reason, score, is_fully, out_item))
 
             if not output_names_this_edge:
@@ -370,20 +413,6 @@ def build_visualize_edges(
                 round_output_seen.add((round_num, mp))
                 best_score_by_node[mp] = max(best_score_by_node.get(mp, -1.0), score)
 
-            # 节点候选:同轮同节点保留匹配分更高的
-            for mp, _reason, score, is_fully, out_item in output_names_this_edge:
-                key = (round_num, mp)
-                if key not in node_candidates or node_candidates[key].get("matched_score", 0) < score:
-                    node = dict(topic_by_name.get(mp, {"name": mp, "point": "", "dimension": "", "root_source": "", "root_sources_desc": ""}))
-                    node["level"] = round_num
-                    node.setdefault("original_word", node.get("name", mp))
-                    node["derivation_type"] = dr.get("method", "")
-                    node["matched_score"] = score
-                    node["is_fully_derived"] = is_fully
-                    # 对应评估中的 derivation_output_point
-                    node["derivation_output_point"] = out_item
-                    node_candidates[key] = node
-
             input_data = dr.get("input") or {}
             derived_nodes = input_data.get("derived_nodes") or []
             tree_nodes = input_data.get("tree_nodes") or []
@@ -425,11 +454,38 @@ def build_visualize_edges(
                 "detail": detail,
             })
 
-        for (rn, name), nd in node_candidates.items():
-            if rn == round_num and nd.get("is_fully_derived"):
+    # 处理最后一轮的 fully_derived_nodes
+    if current_round is not None:
+        for (rn, name), (score, is_fully, _out_item, _method) in best_node_info_by_round_mp.items():
+            if rn == current_round and is_fully:
                 fully_derived_nodes.add(name)
 
-    node_list = list(node_candidates.values())
+    # 根据按 (round, mp) 聚合后的最佳信息生成 node_list
+    # 规则:若某节点在某轮已经 is_fully_derived=True,则之后轮次即便分数更高也不再保留该节点
+    first_full_round_by_name: dict[str, int] = {}
+    for (round_num, mp), (_score, is_fully, _out_item, _method) in best_node_info_by_round_mp.items():
+        if not is_fully:
+            continue
+        prev = first_full_round_by_name.get(mp)
+        if prev is None or round_num < prev:
+            first_full_round_by_name[mp] = round_num
+
+    node_list: list[dict] = []
+    for (round_num, mp), (score, is_fully, out_item, method) in best_node_info_by_round_mp.items():
+        full_round = first_full_round_by_name.get(mp)
+        # 若存在更早的 fully 轮次,且当前轮次在其之后,则不再保留
+        if full_round is not None and round_num > full_round:
+            continue
+        base = dict(topic_by_name.get(mp, {"name": mp, "point": "", "dimension": "", "root_source": "", "root_sources_desc": ""}))
+        base["level"] = round_num
+        base.setdefault("original_word", base.get("name", mp))
+        base["derivation_type"] = method
+        base["matched_score"] = score
+        base["is_fully_derived"] = is_fully
+        base["derivation_output_point"] = out_item
+        node_list.append(base)
+
+    node_list.sort(key=lambda n: (n.get("level", 0), str(n.get("name", ""))))
     return node_list, edge_list
 
 
@@ -502,9 +558,9 @@ if __name__ == "__main__":
     account_name="家有大志"
 
     items = [
-        {"post_id":"68fb6a5c000000000302e5de","log_id":"20260317214307"},
-        {"post_id":"69185d49000000000d00f94e","log_id":"20260317214841"},
-        {"post_id":"6921937a000000001b0278d1","log_id":"20260317215616"}
+        {"post_id":"68fb6a5c000000000302e5de","log_id":"20260318220540"},
+        {"post_id":"69185d49000000000d00f94e","log_id":"20260318221136"},
+        {"post_id":"6921937a000000001b0278d1","log_id":"20260318221538"}
     ]
     for item in items:
         post_id = item["post_id"]

+ 2 - 2
examples_how/overall_derivation/overall_derivation_agent_run.py

@@ -525,7 +525,7 @@ async def main(account_name, post_id):
         # 以脚本所在目录为基准,兼容从项目根或脚本目录启动
         script_dir = Path(__file__).resolve().parent
         output_dir = script_dir / "output"
-        output_file = output_dir / account_name / "推导日志" / current_trace_id / log_id / "result.txt"
+        output_file = output_dir / account_name / "推导日志" / post_id / log_id / "result.txt"
         output_file.parent.mkdir(parents=True, exist_ok=True)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(final_response)
@@ -550,4 +550,4 @@ async def main(account_name, post_id):
 if __name__ == "__main__":
     # anthropic/claude-sonnet-4.6
     # google/gemini-3-flash-preview
-    asyncio.run(main(account_name="家有大志", post_id="68fb6a5c000000000302e5de"))
+    asyncio.run(main(account_name="家有大志", post_id="6921937a000000001b0278d1"))

+ 39 - 14
examples_how/overall_derivation/tools/pattern_dimension_analyze.py

@@ -335,6 +335,14 @@ class TreeIndex:
                 self.node_info.setdefault(child, {})
                 if self.node_info[child].get("depth") is None:
                     self.node_info[child]["depth"] = cur_depth + 1
+                    # BFS 首次到达该节点时(即最短路径),同步修正 parent 指针,
+                    # 确保 parent 与 depth 始终保持一致。
+                    # 若同名节点在树中多处出现,walk() 会用最后一次遍历的父节点
+                    # 覆盖 parent,导致 parent 指向更深处的节点,
+                    # 进而使 find_ancestor_at_level 沿 parent 链爬升时出现深度
+                    # 倒退(越走越深)甚至返回错误祖先/None 的问题。
+                    # 在 BFS 阶段统一修正,可保证 parent 链单调递减至根节点。
+                    self.node_info[child]["parent"] = cur
                     q.append(child)
 
     def find_ancestor_at_level(self, node_name: str, level: int) -> Optional[str]:
@@ -343,6 +351,16 @@ class TreeIndex:
         - 若 node_name 自身 depth == level,直接返回自身。
         - 若 node_name depth < level(比目标层浅),返回自身。
         - 否则沿 parent 链向上查找,返回第一个 depth == level 的祖先节点。
+
+        说明:
+        早期实现中为了防止意外环路使用了 visited 集合,一旦检测到「重复节点」就直接
+        返回 None,导致在树中存在同名节点、且 parent 指针被覆盖的情况下,会错误返回
+        None。这里改为**只沿 parent 链向上行走**,不再依赖 visited 截断:
+        - 每一步仅查看当前节点的 depth 与 parent;
+        - 一旦到达 depth <= level,直接返回当前节点;
+        - 若 parent 为空,则返回当前已到达的最高节点。
+        在正常树结构下(parent 指针无环),该过程必然在有限步内结束;若底层数据意外
+        形成环,需在构建 node_info 时修复,祖先查找本身不再额外承担防御职责。
         """
         info = self.node_info.get(node_name)
         if not info:
@@ -352,21 +370,20 @@ class TreeIndex:
             return None
         if depth <= level:
             return node_name
+        # 只沿 parent 链向上查找,不再依赖 visited 截断;
+        # 一旦到达 depth <= level 或 parent 为空即返回当前节点。
         cur = node_name
-        visited: Set[str] = set()
-        while cur and cur not in visited:
-            visited.add(cur)
+        while True:
             cur_info = self.node_info.get(cur) or {}
-            cur_depth = cur_info.get("depth") or 0
-            if cur_depth == level:
+            cur_depth = cur_info.get("depth")
+            if cur_depth is None:
                 return cur
-            if cur_depth < level:
+            if cur_depth <= level:
                 return cur
             parent = cur_info.get("parent")
             if parent is None:
                 return cur
             cur = parent
-        return None
 
     # 聚类搜索(不再区分维度)
     def find_clusters(
@@ -919,13 +936,22 @@ def main(account_name, post_id, log_id) -> None:
     # 控制台打印前 4000 字符,便于快速查看
     # print(json.dumps(result, ensure_ascii=False, indent=2)[:4000] + "...")
 
-    # 写入输出文件:../output/{account_name}/推导日志/{post_id}/{log_id}/pattern_dimension_analyze.json
+    # 写入输出文件 1:../output/{account_name}/推导日志/{post_id}/{log_id}/pattern_dimension_analyze.json
     out_dir = _round_eval_dir(account_name, post_id, log_id)
     out_dir.mkdir(parents=True, exist_ok=True)
     output_file_name = f"{post_id}_pattern_dimension_analyze.json"
     out_path = out_dir / output_file_name
     with open(out_path, "w", encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+
+    # 写入输出文件 2:../output/{account_name}/整体推导维度分析/{post_id}_pattern_dimension_analyze.json
+    overall_dir = _BASE_OUTPUT / account_name / "整体推导维度分析"
+    overall_dir.mkdir(parents=True, exist_ok=True)
+    overall_output_file_name = f"{post_id}_pattern_dimension_analyze.json"
+    overall_out_path = overall_dir / overall_output_file_name
+    with open(overall_out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
     print(f"\n分析结果已写入: {out_path}")
 
 
@@ -944,7 +970,6 @@ def main_round_pattern_dimension_analyze(
             post_id=post_id,
             log_id=log_id,
             round=round_num,
-            context=None,
         )
         if result is None:
             print(
@@ -984,14 +1009,14 @@ if __name__ == "__main__":
     run_full_pattern_analyze = True
 
     test_account_name = "家有大志"
-    test_post_id = "68fb6a5c000000000302e5de"
-    test_log_id = "20260317214307"
+    test_post_id = "69185d49000000000d00f94e"
+    test_log_id = "20260318221136"
     test_round = 1
 
     items = [
-        {"post_id": "68fb6a5c000000000302e5de", "log_id": "20260318172724"},
-        # {"post_id": "69185d49000000000d00f94e", "log_id": "20260317214841"},
-        # {"post_id": "6921937a000000001b0278d1", "log_id": "20260317215616"},
+        {"post_id": "68fb6a5c000000000302e5de", "log_id": "20260318220540"},
+        {"post_id": "69185d49000000000d00f94e", "log_id": "20260318221136"},
+        {"post_id": "6921937a000000001b0278d1", "log_id": "20260318221538"}
     ]
 
     if run_round_pattern_test: