liuzhiheng 3 днів тому
батько
коміт
4e35e13ee5

+ 1 - 1
examples_how/overall_derivation/derivation_main.md

@@ -1,5 +1,5 @@
 ---
-model: anthropic/claude-sonnet-4.6
+model: google/gemini-3-flash-preview
 temperature: 0.3
 ---
 

+ 4 - 4
examples_how/overall_derivation/overall_derivation_agent_run.py

@@ -303,7 +303,7 @@ async def main(account_name, post_id):
         trace_store=store,
         llm_call=create_openrouter_llm_call(model=model_id),
         skills_dir=skills_dir,
-        experiences_path="./.cache/experiences_overall_derivation.md",
+        # experiences_path="./.cache/experiences_overall_derivation.md",
         debug=True,
     )
 
@@ -421,7 +421,7 @@ async def main(account_name, post_id):
                     if isinstance(item, Trace):
                         current_trace_id = item.trace_id
                         if item.status == "running":
-                            print(f"[Trace] 开始: {item.trace_id[:8]}...")
+                            print(f"[Trace] 开始: {item.trace_id[:100]}...")
                         elif item.status == "completed":
                             print("\n[Trace] ✅ 完成")
                             print(f"  - Total messages: {item.total_messages}")
@@ -444,7 +444,7 @@ async def main(account_name, post_id):
                                     print("\n[Response] Agent 回复:")
                                     print(text)
                                 elif text:
-                                    preview = text[:150] + "..." if len(text) > 150 else text
+                                    preview = text[:500] + "..." if len(text) > 500 else text
                                     print(f"[Assistant] {preview}")
                                 if tool_calls:
                                     for tc in tool_calls:
@@ -460,7 +460,7 @@ async def main(account_name, post_id):
                                 print(f"[Tool Result] ✅ {tool_name}")
                                 print(f"  result: {tool_result}")
                             if item.description:
-                                desc = item.description[:80] if len(item.description) > 80 else item.description
+                                desc = item.description[:500] if len(item.description) > 500 else item.description
                                 print(f"  {desc}...")
 
             except Exception as e:

+ 9 - 22
examples_how/overall_derivation/tools/find_pattern.py

@@ -4,11 +4,18 @@
 功能:读取账号的 pattern 库,合并去重后按条件概率筛选,返回 topN 条 pattern(含 pattern 名称、条件概率)。
 """
 
-import importlib.util
 import json
+import sys
 from pathlib import Path
 from typing import Any, Optional
 
+# 保证直接运行或作为包加载时都能解析 utils / tools(IDE 可跳转)
+_root = Path(__file__).resolve().parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+from utils.conditional_ratio_calc import calc_pattern_conditional_ratio
+from tools.point_match import _load_match_data, match_derivation_to_post_points
+
 try:
     from agent.tools import tool, ToolResult, ToolContext
 except ImportError:
@@ -27,28 +34,8 @@ TOP_KEYS = [
 ]
 SUB_KEYS = ["two_x", "one_x", "zero_x"]
 
-# 加载 conditional_ratio_calc(与 find_tree_node 一致)
-_utils_dir = Path(__file__).resolve().parent.parent / "utils"
-_cond_spec = importlib.util.spec_from_file_location(
-    "conditional_ratio_calc",
-    _utils_dir / "conditional_ratio_calc.py",
-)
-_cond_mod = importlib.util.module_from_spec(_cond_spec)
-_cond_spec.loader.exec_module(_cond_mod)
-calc_pattern_conditional_ratio = _cond_mod.calc_pattern_conditional_ratio
-
 _BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
 
-# 加载 point_match(用于检查 pattern 元素是否匹配帖子选题点)
-_point_match_spec = importlib.util.spec_from_file_location(
-    "point_match",
-    Path(__file__).resolve().parent / "point_match.py",
-)
-_point_match_mod = importlib.util.module_from_spec(_point_match_spec)
-_point_match_spec.loader.exec_module(_point_match_mod)
-_match_derivation_to_post_points = _point_match_mod.match_derivation_to_post_points
-_load_match_data = _point_match_mod._load_match_data
-
 
 def _pattern_file(account_name: str) -> Path:
     """pattern 库文件:../input/{account_name}/原始数据/pattern/processed_edge_data.json"""
@@ -259,7 +246,7 @@ async def find_pattern(
                     if elem and elem not in seen_elements:
                         all_elements.append(elem)
                         seen_elements.add(elem)
-            matched_results = await _match_derivation_to_post_points(all_elements, account_name, post_id)
+            matched_results = await match_derivation_to_post_points(all_elements, account_name, post_id)
             elem_match_map: dict[str, list] = {}
             for m in matched_results:
                 elem_match_map.setdefault(m["推导选题点"], []).append({

+ 10 - 22
examples_how/overall_derivation/tools/find_tree_node.py

@@ -6,11 +6,18 @@
 2. 获取符合条件概率阈值的节点(按条件概率排序返回 topN)
 """
 
-import importlib.util
 import json
+import sys
 from pathlib import Path
 from typing import Any, Optional
 
+# 保证直接运行或作为包加载时都能解析 utils / tools(IDE 可跳转)
+_root = Path(__file__).resolve().parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+from utils.conditional_ratio_calc import calc_node_conditional_ratio
+from tools.point_match import match_derivation_to_post_points
+
 try:
     from agent.tools import tool, ToolResult, ToolContext
 except ImportError:
@@ -19,28 +26,9 @@ except ImportError:
     ToolResult = None  # 仅用 main() 测核心逻辑时可无 agent
     ToolContext = None
 
-# 加载同目录层级的 conditional_ratio_calc(不依赖包结构)
-_utils_dir = Path(__file__).resolve().parent.parent / "utils"
-_cond_spec = importlib.util.spec_from_file_location(
-    "conditional_ratio_calc",
-    _utils_dir / "conditional_ratio_calc.py",
-)
-_cond_mod = importlib.util.module_from_spec(_cond_spec)
-_cond_spec.loader.exec_module(_cond_mod)
-calc_node_conditional_ratio = _cond_mod.calc_node_conditional_ratio
-
 # 相对本文件:tools -> overall_derivation,input 在 overall_derivation 下
 _BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
 
-# 加载 point_match(用于检查节点是否匹配帖子选题点)
-_point_match_spec = importlib.util.spec_from_file_location(
-    "point_match",
-    Path(__file__).resolve().parent / "point_match.py",
-)
-_point_match_mod = importlib.util.module_from_spec(_point_match_spec)
-_point_match_spec.loader.exec_module(_point_match_mod)
-_match_derivation_to_post_points = _point_match_mod.match_derivation_to_post_points
-
 
 def _tree_dir(account_name: str) -> Path:
     """人设树目录:../input/{account_name}/原始数据/tree/"""
@@ -217,7 +205,7 @@ async def find_tree_constant_nodes(
         # 批量匹配所有节点与帖子选题点
         if items and post_id:
             node_names = [x["节点名称"] for x in items]
-            matched_results = await _match_derivation_to_post_points(node_names, account_name, post_id)
+            matched_results = await match_derivation_to_post_points(node_names, account_name, post_id)
             node_match_map: dict[str, list] = {}
             for m in matched_results:
                 node_match_map.setdefault(m["推导选题点"], []).append({
@@ -303,7 +291,7 @@ async def find_tree_nodes_by_conditional_ratio(
         # 批量匹配所有节点与帖子选题点
         if items and post_id:
             node_names = [x["节点名称"] for x in items]
-            matched_results = await _match_derivation_to_post_points(node_names, account_name, post_id)
+            matched_results = await match_derivation_to_post_points(node_names, account_name, post_id)
             node_match_map: dict[str, list] = {}
             for m in matched_results:
                 node_match_map.setdefault(m["推导选题点"], []).append({

+ 7 - 11
examples_how/overall_derivation/tools/point_match.py

@@ -4,11 +4,17 @@
 功能:读取帖子选题点列表,与推导选题点做相似度计算,返回 combined_score >= 阈值的匹配对。
 """
 
-import importlib.util
 import json
+import sys
 from pathlib import Path
 from typing import Any, List, Optional
 
+# 保证直接运行或作为包加载时都能解析 utils(IDE 可跳转)
+_root = Path(__file__).resolve().parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+from utils.similarity_calc import similarity_matrix
+
 try:
     from agent.tools import tool, ToolResult, ToolContext
 except ImportError:
@@ -17,16 +23,6 @@ except ImportError:
     ToolResult = None
     ToolContext = None
 
-# 加载 similarity_calc
-_utils_dir = Path(__file__).resolve().parent.parent / "utils"
-_sim_spec = importlib.util.spec_from_file_location(
-    "similarity_calc",
-    _utils_dir / "similarity_calc.py",
-)
-_sim_mod = importlib.util.module_from_spec(_sim_spec)
-_sim_spec.loader.exec_module(_sim_mod)
-similarity_matrix = _sim_mod.similarity_matrix
-
 _BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
 
 # 默认匹配阈值

+ 1 - 1
examples_how/overall_derivation/utils/conditional_ratio_calc.py

@@ -115,7 +115,7 @@ def calc_node_conditional_ratio(
     set_p = set(p_pids)
     den = len(derived_post_ids & set_p)
     if den == 0:
-        return 1.0
+        return 0.0
     num = len(derived_post_ids & set_n)
     return min(1.0, num / den)
 

+ 44 - 30
examples_how/overall_derivation/utils/similarity_calc.py

@@ -97,7 +97,7 @@ DEFAULT_BATCH_PROMPT_TEMPLATE = """
 # Embedding 相似度 API
 EMBEDDING_SIMILARITY_URL = "http://61.48.133.26:8187/cartesian_similarity"
 # LLM 模型
-LLM_MODEL = "openai/gpt-4.1-mini"
+LLM_MODEL = "google/gemini-3-flash-preview"
 
 
 def _phrase_pairs(phrases_a: List[str], phrases_b: List[str]) -> List[Tuple[str, str]]:
@@ -165,9 +165,12 @@ def _extract_json_array(content: str) -> List[dict]:
     return json.loads(content)
 
 
-async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
+async def _llm_similarity(
+    phrases_a: List[str], phrases_b: List[str], *, use_cache: bool = True
+) -> List[List[float]]:
     """
-    用 LLM 对短语对打分,返回 M×N 矩阵。先查原子缓存,仅对未命中的短语对调用 API。
+    用 LLM 对短语对打分,返回 M×N 矩阵。use_cache=True 时先查原子缓存,仅对未命中的短语对调用 API;
+    use_cache=False 时不读不写缓存。
     """
     if not phrases_a or not phrases_b:
         return []
@@ -177,16 +180,18 @@ async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[Li
     missing_indices: List[Tuple[int, int]] = []
     for i in range(M):
         for j in range(N):
-            score = _read_atomic_score("llm", phrases_a[i], phrases_b[j])
-            if score is not None:
-                matrix[i][j] = score
-            else:
-                missing_indices.append((i, j))
+            if use_cache:
+                score = _read_atomic_score("llm", phrases_a[i], phrases_b[j])
+                if score is not None:
+                    matrix[i][j] = score
+                    continue
+            missing_indices.append((i, j))
 
     total = M * N
-    hit_count = total - len(missing_indices)
-    if hit_count > 0:
-        logger.info("[similarity_matrix] LLM 原子缓存命中 %d/%d", hit_count, total)
+    if use_cache:
+        hit_count = total - len(missing_indices)
+        if hit_count > 0:
+            logger.info("[similarity_matrix] LLM 原子缓存命中 %d/%d", hit_count, total)
     if not missing_indices:
         return matrix
 
@@ -213,8 +218,9 @@ async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[Li
         score = float(items[idx].get("score", 0.0))
         score = max(0.0, min(1.0, score))
         matrix[i][j] = score
-        a, b = phrases_a[i], phrases_b[j]
-        _write_atomic_score("llm", a, b, score)
+        if use_cache:
+            a, b = phrases_a[i], phrases_b[j]
+            _write_atomic_score("llm", a, b, score)
     return matrix
 
 
@@ -224,6 +230,7 @@ async def similarity_matrix(
     *,
     embedding_weight: float = 0.5,
     llm_weight: float = 0.5,
+    use_llm_cache: bool = True,
 ) -> List[SimilarityItem]:
     """
     计算两组短语的相似度,返回对象列表(每条含 phrase_a, phrase_b, embedding_score, llm_score, combined_score)。
@@ -236,6 +243,7 @@ async def similarity_matrix(
         phrases_b: 第二组短语列表(N 个)
         embedding_weight: embedding 权重,默认 0.5
         llm_weight: LLM 权重,默认 0.5
+        use_llm_cache: 是否使用 LLM 相似度缓存,默认 True
 
     Returns:
         对象列表,长度 M×N,顺序与短语对 (a0,b0),(a0,b1),...,(aM-1,bN-1) 一致。
@@ -256,7 +264,7 @@ async def similarity_matrix(
 
     async def _run_llm() -> List[List[float]]:
         t0 = time.perf_counter()
-        out = await _llm_similarity(phrases_a, phrases_b)
+        out = await _llm_similarity(phrases_a, phrases_b, use_cache=use_llm_cache)
         logger.info("[similarity_matrix] LLM 耗时: %.3fs", time.perf_counter() - t0)
         return out
 
@@ -328,22 +336,28 @@ def test_extract_json_array() -> None:
 
 
 async def test_similarity_matrix() -> None:
-    """集成测试:调用 embedding + LLM 得到相似度对象列表。"""
-    phrases_a = ["犬", "猫咪"]
-    phrases_b = ["狗", "手机"]
-    items = await similarity_matrix(phrases_a, phrases_b)
-    assert len(items) == 4
-    for row in items:
-        assert "phrase_a" in row and "phrase_b" in row
-        assert "embedding_score" in row and "llm_score" in row and "combined_score" in row
-        assert 0 <= row["combined_score"] <= 1, f"combined_score 应在 [0,1],得到 {row['combined_score']}"
-    # 语义上 "犬"-"狗" 应高于 "犬"-"手机"
-    dog_dog = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "狗")
-    dog_phone = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "手机")
-    assert dog_dog["combined_score"] > dog_phone["combined_score"], "犬-狗 应高于 犬-手机"
-    print("test_similarity_matrix: ok")
-    for r in items:
-        print(f"  {r['phrase_a']}-{r['phrase_b']}: emb={r['embedding_score']:.4f} llm={r['llm_score']:.4f} combined={r['combined_score']:.4f}")
+    """集成测试:调用 embedding + LLM 得到相似度对象列表。use_llm_cache 可控制是否使用 LLM 缓存。"""
+    # use_llm_cache = True
+    use_llm_cache = False
+    phrases_a = ["犬", "猫咪", "夸张"]
+    phrases_b = ["狗", "手机", "夸张堆叠"]
+    items = await similarity_matrix(phrases_a, phrases_b, use_llm_cache=use_llm_cache)
+    for item in items:
+        print(item)
+
+
+    # assert len(items) == 4
+    # for row in items:
+    #     assert "phrase_a" in row and "phrase_b" in row
+    #     assert "embedding_score" in row and "llm_score" in row and "combined_score" in row
+    #     assert 0 <= row["combined_score"] <= 1, f"combined_score 应在 [0,1],得到 {row['combined_score']}"
+    # # 语义上 "犬"-"狗" 应高于 "犬"-"手机"
+    # dog_dog = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "狗")
+    # dog_phone = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "手机")
+    # assert dog_dog["combined_score"] > dog_phone["combined_score"], "犬-狗 应高于 犬-手机"
+    # print("test_similarity_matrix: ok")
+    # for r in items:
+    #     print(f"  {r['phrase_a']}-{r['phrase_b']}: emb={r['embedding_score']:.4f} llm={r['llm_score']:.4f} combined={r['combined_score']:.4f}")
 
 
 if __name__ == "__main__":