|
|
@@ -1,15 +1,16 @@
|
|
|
"""
|
|
|
-Pattern 维度聚类分析 Tool
|
|
|
+Pattern 维度分析 Tool
|
|
|
|
|
|
功能概述:
|
|
|
-1. 读取某次整体推导日志目录下各轮评估结果,累计每轮已匹配的 matched_post_point。
|
|
|
-2. 基于帖子与 pattern 库的匹配结果,对 pattern 元素做打分与分类(已推导/未推导)。
|
|
|
-3. 在账号人设树(实质/形式/意图)中,分别为「已推导元素」「未推导元素」寻找聚类节点。
|
|
|
+1. 读取某次整体推导日志目录下各轮评估结果,累计 matched_post_point / derivation_output_point 等字段。
|
|
|
+2. 每轮通过 derivation_output_point 在人设树中找到 cluster_level 层祖先节点(已推导维度节点集合)。
|
|
|
+3. 从 deduped_patterns 中筛选「元素数 >= 5 且已推导维度节点占比 >= 50%」的 pattern,并对各元素标记是否已推导。
|
|
|
|
|
|
输入参数:
|
|
|
- account_name: 账号名称
|
|
|
- post_id: 帖子 ID
|
|
|
- log_id: 推导日志目录名(形如 20260313210921)
|
|
|
+- cluster_level: 在人设树中查找祖先节点的目标深度(root 为 0 层)
|
|
|
"""
|
|
|
|
|
|
import json
|
|
|
@@ -22,7 +23,6 @@ _root = Path(__file__).resolve().parent.parent
|
|
|
if str(_root) not in sys.path:
|
|
|
sys.path.insert(0, str(_root))
|
|
|
|
|
|
-from tools.point_match import _load_match_data # 帖子选题点与人设树节点匹配分
|
|
|
from tools.find_tree_node import _load_trees # 加载三棵人设树
|
|
|
|
|
|
|
|
|
@@ -58,8 +58,18 @@ def _load_round_matched_points(
|
|
|
[
|
|
|
{
|
|
|
"round": 1,
|
|
|
- "round_points": [... 本轮 matched_post_point 去重 ...],
|
|
|
- "cumulative_points": [... 累计到本轮的 matched_post_point 去重 ...],
|
|
|
+ "round_points": [
|
|
|
+ {
|
|
|
+ "matched_post_point": "叙事结构",
|
|
|
+ "derivation_output_point": "叙事编排",
|
|
|
+ "matched_score": 0.9151,
|
|
|
+ "is_fully_derived": true,
|
|
|
+ },
|
|
|
+ ...
|
|
|
+ ],
|
|
|
+ "cumulative_points": [
|
|
|
+ ... 累计到本轮的去重列表(以 derivation_output_point 为去重 key) ...
|
|
|
+ ],
|
|
|
},
|
|
|
...
|
|
|
]
|
|
|
@@ -83,8 +93,8 @@ def _load_round_matched_points(
|
|
|
|
|
|
eval_files.sort(key=lambda x: x[0])
|
|
|
results: List[Dict[str, Any]] = []
|
|
|
- cumulative: List[str] = []
|
|
|
- cumulative_set: Set[str] = set()
|
|
|
+ cumulative: List[Dict[str, Any]] = []
|
|
|
+ cumulative_set: Set[str] = set() # 以 derivation_output_point 去重
|
|
|
|
|
|
for r, path in eval_files:
|
|
|
try:
|
|
|
@@ -93,34 +103,42 @@ def _load_round_matched_points(
|
|
|
except Exception:
|
|
|
continue
|
|
|
eval_results = data.get("eval_results") or []
|
|
|
- round_points: List[str] = []
|
|
|
+ round_points: List[Dict[str, Any]] = []
|
|
|
+ seen_in_round: Set[str] = set()
|
|
|
+
|
|
|
for item in eval_results:
|
|
|
if not isinstance(item, dict):
|
|
|
continue
|
|
|
if not item.get("is_matched"):
|
|
|
continue
|
|
|
|
|
|
- # 根据是否已完全推导,选择不同的帖子选题点字段:
|
|
|
- # - is_fully_derived 为 False 时,使用 derivation_output_point
|
|
|
- # - 其他情况(True 或缺失)使用 matched_post_point(兼容旧数据)
|
|
|
- if item.get("is_fully_derived") is False:
|
|
|
- mp = item.get("derivation_output_point")
|
|
|
- else:
|
|
|
- mp = item.get("matched_post_point")
|
|
|
-
|
|
|
- if mp is None:
|
|
|
+ dop = item.get("derivation_output_point")
|
|
|
+ if dop is None:
|
|
|
continue
|
|
|
- mp = str(mp).strip()
|
|
|
- if not mp:
|
|
|
+ dop = str(dop).strip()
|
|
|
+ if not dop:
|
|
|
continue
|
|
|
- if mp not in round_points:
|
|
|
- round_points.append(mp)
|
|
|
|
|
|
- # 累加到本轮
|
|
|
- for mp in round_points:
|
|
|
- if mp not in cumulative_set:
|
|
|
- cumulative_set.add(mp)
|
|
|
- cumulative.append(mp)
|
|
|
+ # 本轮内按 derivation_output_point 去重
|
|
|
+ if dop in seen_in_round:
|
|
|
+ continue
|
|
|
+ seen_in_round.add(dop)
|
|
|
+
|
|
|
+ mpp = item.get("matched_post_point")
|
|
|
+ entry: Dict[str, Any] = {
|
|
|
+ "matched_post_point": str(mpp).strip() if mpp is not None else None,
|
|
|
+ "derivation_output_point": dop,
|
|
|
+ "matched_score": item.get("matched_score"),
|
|
|
+ "is_fully_derived": item.get("is_fully_derived"),
|
|
|
+ }
|
|
|
+ round_points.append(entry)
|
|
|
+
|
|
|
+ # 累加到累计列表(按 derivation_output_point 去重)
|
|
|
+ for entry in round_points:
|
|
|
+ dop = entry["derivation_output_point"]
|
|
|
+ if dop not in cumulative_set:
|
|
|
+ cumulative_set.add(dop)
|
|
|
+ cumulative.append(entry)
|
|
|
|
|
|
results.append(
|
|
|
{
|
|
|
@@ -207,106 +225,6 @@ def _dedupe_patterns(raw_patterns: List[Dict[str, Any]]) -> List[Dict[str, Any]]
|
|
|
return list(key_to_best.values())
|
|
|
|
|
|
|
|
|
-def _score_patterns_by_matched_points(
|
|
|
- patterns: List[Dict[str, Any]],
|
|
|
- account_name: str,
|
|
|
- post_id: str,
|
|
|
- matched_post_points: List[str],
|
|
|
- match_threshold: float,
|
|
|
-) -> List[Dict[str, Any]]:
|
|
|
- """
|
|
|
- 对传入的 pattern 列表计算其元素与 matched_post_point 列表的匹配分:
|
|
|
- - 匹配分来源:../input/{account_name}/match_data/{post_id}_匹配_all.json
|
|
|
- lookup key 为 (帖子选题点, 人设树节点)
|
|
|
- - 对于每个 pattern 的每个元素(item):
|
|
|
- * 以 item["name"] 视为人设树节点名称
|
|
|
- * 对每个 matched_post_point 查找匹配分,取最大值
|
|
|
- - 仅保留「至少有一个元素匹配分 >= match_threshold」的 pattern。
|
|
|
- 返回的 pattern 结构(item 不再保留 point / dimension 字段):
|
|
|
- {
|
|
|
- "id": xxx,
|
|
|
- "support": xxx,
|
|
|
- "items": [
|
|
|
- {
|
|
|
- "name": "xxx",
|
|
|
- "type": "xxx",
|
|
|
- "matched_post_point": "xxx" | null,
|
|
|
- "matched_score": float,
|
|
|
- },
|
|
|
- ...
|
|
|
- ],
|
|
|
- }
|
|
|
- """
|
|
|
- if not patterns or not matched_post_points:
|
|
|
- return []
|
|
|
-
|
|
|
- match_lookup = _load_match_data(account_name, post_id)
|
|
|
- matched_post_points = [str(x).strip() for x in matched_post_points if str(x).strip()]
|
|
|
- if not matched_post_points:
|
|
|
- return []
|
|
|
-
|
|
|
- results: List[Dict[str, Any]] = []
|
|
|
- for p in patterns:
|
|
|
- items = p.get("items") or []
|
|
|
- if not isinstance(items, list):
|
|
|
- continue
|
|
|
- scored_items: List[Dict[str, Any]] = []
|
|
|
- max_item_score = 0.0
|
|
|
-
|
|
|
- for it in items:
|
|
|
- if not isinstance(it, dict):
|
|
|
- continue
|
|
|
- name = str(it.get("name") or "").strip()
|
|
|
- _type = str(it.get("type") or "").strip()
|
|
|
-
|
|
|
- best_score = 0.0
|
|
|
- best_post_point: Optional[str] = None
|
|
|
- if name:
|
|
|
- for post_point in matched_post_points:
|
|
|
- # 如果帖子选题点与节点名称完全一致,直接视为满分匹配
|
|
|
- if post_point == name:
|
|
|
- s = 1.0
|
|
|
- else:
|
|
|
- score = match_lookup.get((post_point, name))
|
|
|
- if score is None:
|
|
|
- continue
|
|
|
- try:
|
|
|
- s = float(score)
|
|
|
- except (TypeError, ValueError):
|
|
|
- continue
|
|
|
- if s > best_score:
|
|
|
- best_score = s
|
|
|
- best_post_point = post_point
|
|
|
-
|
|
|
- if best_score > max_item_score:
|
|
|
- max_item_score = best_score
|
|
|
-
|
|
|
- scored_items.append(
|
|
|
- {
|
|
|
- "name": name,
|
|
|
- "type": _type,
|
|
|
- "matched_post_point": best_post_point,
|
|
|
- "matched_score": round(best_score, 6),
|
|
|
- }
|
|
|
- )
|
|
|
-
|
|
|
- if not scored_items:
|
|
|
- continue
|
|
|
- if max_item_score < match_threshold:
|
|
|
- # 该 pattern 在本轮未与帖子形成足够强的匹配
|
|
|
- continue
|
|
|
-
|
|
|
- results.append(
|
|
|
- {
|
|
|
- "id": p.get("id"),
|
|
|
- "support": p.get("support"),
|
|
|
- "items": scored_items,
|
|
|
- }
|
|
|
- )
|
|
|
-
|
|
|
- return results
|
|
|
-
|
|
|
-
|
|
|
# ---------------------------------------------------------------------------
|
|
|
# 3. 人设树节点信息 & 聚类节点搜索
|
|
|
# ---------------------------------------------------------------------------
|
|
|
@@ -399,6 +317,37 @@ class TreeIndex:
|
|
|
self.node_info[child]["depth"] = cur_depth + 1
|
|
|
q.append(child)
|
|
|
|
|
|
+ def find_ancestor_at_level(self, node_name: str, level: int) -> Optional[str]:
|
|
|
+ """
|
|
|
+ 在人设树中找到 node_name 的 depth == level 的祖先节点。
|
|
|
+ - 若 node_name 自身 depth == level,直接返回自身。
|
|
|
+ - 若 node_name depth < level(比目标层浅),返回自身。
|
|
|
+ - 否则沿 parent 链向上查找,返回第一个 depth == level 的祖先节点。
|
|
|
+ """
|
|
|
+ info = self.node_info.get(node_name)
|
|
|
+ if not info:
|
|
|
+ return None
|
|
|
+ depth = info.get("depth")
|
|
|
+ if depth is None:
|
|
|
+ return None
|
|
|
+ if depth <= level:
|
|
|
+ return node_name
|
|
|
+ cur = node_name
|
|
|
+ visited: Set[str] = set()
|
|
|
+ while cur and cur not in visited:
|
|
|
+ visited.add(cur)
|
|
|
+ cur_info = self.node_info.get(cur) or {}
|
|
|
+ cur_depth = cur_info.get("depth") or 0
|
|
|
+ if cur_depth == level:
|
|
|
+ return cur
|
|
|
+ if cur_depth < level:
|
|
|
+ return cur
|
|
|
+ parent = cur_info.get("parent")
|
|
|
+ if parent is None:
|
|
|
+ return cur
|
|
|
+ cur = parent
|
|
|
+ return None
|
|
|
+
|
|
|
# 聚类搜索(不再区分维度)
|
|
|
def find_clusters(
|
|
|
self,
|
|
|
@@ -564,95 +513,144 @@ class TreeIndex:
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _analyze_single_round(
|
|
|
- account_name: str,
|
|
|
- post_id: str,
|
|
|
patterns: List[Dict[str, Any]],
|
|
|
tree_index: TreeIndex,
|
|
|
- cumulative_points: List[str],
|
|
|
- match_threshold: float,
|
|
|
+ cumulative_points: List[Dict[str, Any]],
|
|
|
cluster_level: int,
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
- 对某一轮(给定累计 matched_post_point 列表)执行分析:
|
|
|
- - 筛选与帖子匹配度 >= match_threshold 的 pattern
|
|
|
- - 将 pattern 元素按 matched_score 分为「已推导元素」与「未推导元素」
|
|
|
- - 在三棵人设树中(不区分维度)为两组元素分别寻找聚类节点
|
|
|
- """
|
|
|
- patterns = _score_patterns_by_matched_points(
|
|
|
- patterns=patterns,
|
|
|
- account_name=account_name,
|
|
|
- post_id=post_id,
|
|
|
- matched_post_points=cumulative_points,
|
|
|
- match_threshold=match_threshold,
|
|
|
- )
|
|
|
- print(f"_score_patterns_by_matched_points len: {len(patterns)}")
|
|
|
+ 对某一轮(给定累计 point 列表)执行维度分析:
|
|
|
+
|
|
|
+ 1. 从 cumulative_points 中提取 derivation_output_point,
|
|
|
+ 在人设树中找到每个节点的 cluster_level 层祖先 → derived_ancestor_set(已推导维度节点集合)。
|
|
|
+    2. 从 deduped_patterns 中筛选「元素数 >= 5 且 derived_ancestor_set 节点占比 >= 50%」的 pattern。
|
|
|
+ 3. 对筛选出 pattern 的每个元素标记是否已推导:
|
|
|
+ - 元素在 derived_ancestor_set 中 → is_derived=True(已推导维度)
|
|
|
+ - 其他 → is_derived=False(未推导维度)
|
|
|
+ 4. 汇总 derived_dims / underived_dims 列表。
|
|
|
|
|
|
- # 已推导 / 未推导 元素列表(不再按维度拆分)
|
|
|
- derived_elems: List[str] = []
|
|
|
- underived_elems: List[str] = []
|
|
|
+ 返回结构:
|
|
|
+ {
|
|
|
+ "cumulative_points": [...], # 原始累计 point 对象列表
|
|
|
+ "derived_ancestor_nodes": [...], # 所有 derivation_output_point 对应的 cluster_level 层祖先节点(已推导维度节点集合)
|
|
|
+ "patterns": [...], # 筛选后带 is_derived 标记的 pattern 列表
|
|
|
+ "derived_dims": [...], # 已推导维度节点(去重,出现于筛选 pattern 中)
|
|
|
+ "underived_dims": [...], # 未推导维度节点(去重,排除已推导节点)
|
|
|
+ "patterns_count": int,
|
|
|
+ "derived_dim_count": int,
|
|
|
+ "underived_dim_count": int,
|
|
|
+ }
|
|
|
+ """
|
|
|
+ # 1. 收集 derived_ancestor_set,同时记录每个祖先节点对应的 matched_post_point 来源
|
|
|
+ derived_ancestor_set: Set[str] = set()
|
|
|
+ ancestor_to_mpps: Dict[str, List[str]] = {} # 祖先节点 -> [matched_post_point, ...]
|
|
|
+ for entry in cumulative_points:
|
|
|
+ dop = entry.get("derivation_output_point")
|
|
|
+ if not dop:
|
|
|
+ continue
|
|
|
+ ancestor = tree_index.find_ancestor_at_level(str(dop).strip(), cluster_level)
|
|
|
+ if not ancestor:
|
|
|
+ continue
|
|
|
+ derived_ancestor_set.add(ancestor)
|
|
|
+ mpp = entry.get("matched_post_point") or ""
|
|
|
+ if mpp and mpp not in ancestor_to_mpps.get(ancestor, []):
|
|
|
+ ancestor_to_mpps.setdefault(ancestor, []).append(mpp)
|
|
|
|
|
|
+    # 2. 筛选 pattern:元素数 >= 5 且已推导维度节点占所有元素的比例 >= 50%
|
|
|
+ filtered_patterns: List[Dict[str, Any]] = []
|
|
|
for p in patterns:
|
|
|
- for it in p.get("items", []):
|
|
|
+ items = p.get("items") or []
|
|
|
+ item_names = [
|
|
|
+ str(it.get("name") or "").strip()
|
|
|
+ for it in items
|
|
|
+ if isinstance(it, dict)
|
|
|
+ ]
|
|
|
+ if not item_names:
|
|
|
+ continue
|
|
|
+ if len(item_names) < 5:
|
|
|
+ continue
|
|
|
+ derived_count = sum(1 for name in item_names if name in derived_ancestor_set)
|
|
|
+ if derived_count / len(item_names) >= 0.5:
|
|
|
+ filtered_patterns.append(p)
|
|
|
+
|
|
|
+ print(
|
|
|
+ f"filtered_patterns: {len(filtered_patterns)}, "
|
|
|
+ f"derived_ancestor_set: {len(derived_ancestor_set)}"
|
|
|
+ )
|
|
|
+
|
|
|
+ def _node_label(name: str, is_derived: bool) -> str:
|
|
|
+ """
|
|
|
+ 返回格式化标签:
|
|
|
+ - 已推导节点:'node_name->dimension(mpp1,mpp2,...)'
|
|
|
+ - 未推导节点:'node_name->dimension'
|
|
|
+ """
|
|
|
+ dim = (tree_index.node_info.get(name) or {}).get("dimension") or ""
|
|
|
+ base = f"{name}->{dim}" if dim else name
|
|
|
+ if is_derived:
|
|
|
+ mpps = ancestor_to_mpps.get(name) or []
|
|
|
+ if mpps:
|
|
|
+ return f"{base}({','.join(mpps)})"
|
|
|
+ return base
|
|
|
+
|
|
|
+ # 3. 对筛选 pattern 元素分类并汇总维度列表
|
|
|
+ derived_dims: List[str] = []
|
|
|
+ underived_dims: List[str] = []
|
|
|
+ derived_dims_seen: Set[str] = set()
|
|
|
+ underived_dims_seen: Set[str] = set()
|
|
|
+
|
|
|
+ scored_patterns: List[Dict[str, Any]] = []
|
|
|
+ for p in filtered_patterns:
|
|
|
+ items = p.get("items") or []
|
|
|
+ tagged_items: List[Dict[str, Any]] = []
|
|
|
+ for it in items:
|
|
|
if not isinstance(it, dict):
|
|
|
continue
|
|
|
- node_name = str(it.get("name") or "").strip()
|
|
|
- if not node_name:
|
|
|
- continue
|
|
|
- score = float(it.get("matched_score") or 0.0)
|
|
|
- if score >= match_threshold:
|
|
|
- derived_elems.append(node_name)
|
|
|
+ name = str(it.get("name") or "").strip()
|
|
|
+ is_derived = name in derived_ancestor_set
|
|
|
+ tagged_items.append(
|
|
|
+ {
|
|
|
+ "name": name,
|
|
|
+ "is_derived": is_derived,
|
|
|
+ }
|
|
|
+ )
|
|
|
+ if is_derived:
|
|
|
+ if name and name not in derived_dims_seen:
|
|
|
+ derived_dims_seen.add(name)
|
|
|
+ derived_dims.append(_node_label(name, is_derived=True))
|
|
|
else:
|
|
|
- underived_elems.append(node_name)
|
|
|
+ if name and name not in underived_dims_seen:
|
|
|
+ underived_dims_seen.add(name)
|
|
|
+ underived_dims.append(_node_label(name, is_derived=False))
|
|
|
|
|
|
- # 为避免重复元素干扰统计与聚类,先做去重
|
|
|
- derived_set: List[str] = list(dict.fromkeys(derived_elems))
|
|
|
- underived_set: List[str] = list(dict.fromkeys(underived_elems))
|
|
|
+ scored_patterns.append(
|
|
|
+ {
|
|
|
+ "id": p.get("id"),
|
|
|
+ "support": p.get("support"),
|
|
|
+ "items": tagged_items,
|
|
|
+ }
|
|
|
+ )
|
|
|
|
|
|
- clusters: Dict[str, Any] = {
|
|
|
- "derived": [],
|
|
|
- "underived": [],
|
|
|
- }
|
|
|
+ # 从 underived_dims 中排除与 derived_dims 重叠的节点
|
|
|
+ underived_dims = [d for d in underived_dims if d.split("->")[0] not in derived_dims_seen]
|
|
|
|
|
|
- # 已推导元素聚类
|
|
|
- if derived_set:
|
|
|
- c = tree_index.find_clusters(derived_set, cluster_level=cluster_level)
|
|
|
- clusters["derived"] = c or []
|
|
|
-
|
|
|
- # 未推导元素聚类
|
|
|
- if underived_set:
|
|
|
- c = tree_index.find_clusters(underived_set, cluster_level=cluster_level)
|
|
|
- clusters["underived"] = c or []
|
|
|
-
|
|
|
- # 在同一轮中,如果某个 cluster_node 已经在 derived 聚类里出现过,
|
|
|
- # 则从 underived 聚类中剔除该 cluster_node,避免重复展示。
|
|
|
- if isinstance(clusters.get("derived"), list) and isinstance(clusters.get("underived"), list):
|
|
|
- derived_nodes = {
|
|
|
- str(item.get("cluster_node"))
|
|
|
- for item in clusters["derived"]
|
|
|
- if isinstance(item, dict) and item.get("cluster_node") is not None
|
|
|
- }
|
|
|
- if derived_nodes:
|
|
|
- filtered_underived = []
|
|
|
- for item in clusters["underived"]:
|
|
|
- if not isinstance(item, dict):
|
|
|
- continue
|
|
|
- node = str(item.get("cluster_node"))
|
|
|
- if node in derived_nodes:
|
|
|
- continue
|
|
|
- filtered_underived.append(item)
|
|
|
- clusters["underived"] = filtered_underived
|
|
|
+ # 按 is_derived=True 的元素数量从高到低排序,数量相同再按元素总数从高到低
|
|
|
+ scored_patterns.sort(
|
|
|
+ key=lambda x: (
|
|
|
+ sum(1 for it in x.get("items", []) if it.get("is_derived")),
|
|
|
+ len(x.get("items", [])),
|
|
|
+ ),
|
|
|
+ reverse=True,
|
|
|
+ )
|
|
|
|
|
|
return {
|
|
|
- "matched_post_points": list(cumulative_points),
|
|
|
- "patterns": patterns,
|
|
|
- "clusters": clusters,
|
|
|
- # 统计信息:
|
|
|
- # - patterns_count: 本轮参与分析的 pattern 数量
|
|
|
- # - derived_cluster_count: 已推导元素聚类节点数量
|
|
|
- # - underived_cluster_count: 未推导元素聚类节点数量
|
|
|
- "patterns_count": len(patterns),
|
|
|
- "derived_cluster_count": len(clusters["derived"]) if isinstance(clusters.get("derived"), list) else 0,
|
|
|
- "underived_cluster_count": len(clusters["underived"]) if isinstance(clusters.get("underived"), list) else 0,
|
|
|
+ "cumulative_points": list(cumulative_points),
|
|
|
+ "derived_ancestor_nodes": sorted(derived_ancestor_set),
|
|
|
+ "patterns": scored_patterns,
|
|
|
+ "derived_dims": derived_dims,
|
|
|
+ "underived_dims": underived_dims,
|
|
|
+ "patterns_count": len(scored_patterns),
|
|
|
+ "derived_dim_count": len(derived_dims),
|
|
|
+ "underived_dim_count": len(underived_dims),
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -661,7 +659,6 @@ def pattern_dimension_analyze(
|
|
|
account_name: str,
|
|
|
post_id: str,
|
|
|
log_id: str,
|
|
|
- match_threshold: float = 0.6,
|
|
|
cluster_level: int = 2,
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
@@ -670,11 +667,16 @@ def pattern_dimension_analyze(
|
|
|
参数
|
|
|
-------
|
|
|
account_name : 账号名(用于定位 input / output 下的数据目录)
|
|
|
- post_id : 帖子 ID(用于定位推导日志与帖子匹配数据)
|
|
|
+ post_id : 帖子 ID(用于定位推导日志)
|
|
|
log_id : 推导日志目录名(../output/{account_name}/推导日志/{post_id}/{log_id}/)
|
|
|
- match_threshold : pattern 元素与 matched_post_point 的最小匹配分,默认 0.6
|
|
|
- cluster_level : 在人设树中搜索聚类节点的聚类层级(root 为 0 层),默认 2
|
|
|
-
|
|
|
+ cluster_level : 在人设树中查找祖先节点的目标深度(root 为 0 层),默认 2
|
|
|
+
|
|
|
+ 逻辑概述
|
|
|
+ --------
|
|
|
+ 每一轮:
|
|
|
+ 1. 从 derivation_output_point 在人设树中找到 cluster_level 层祖先节点 → 已推导维度节点集合。
|
|
|
+    2. 筛选「元素数 >= 5 且已推导维度节点占比 >= 50%」的 pattern。
|
|
|
+ 3. 标记每个 pattern 元素是否已推导,汇总 derived_dims / underived_dims。
|
|
|
"""
|
|
|
eval_dir = _round_eval_dir(account_name, post_id, log_id)
|
|
|
if not eval_dir.is_dir():
|
|
|
@@ -686,7 +688,6 @@ def pattern_dimension_analyze(
|
|
|
"account_name": account_name,
|
|
|
"post_id": post_id,
|
|
|
"log_id": log_id,
|
|
|
- "match_threshold": match_threshold,
|
|
|
"cluster_level": cluster_level,
|
|
|
"rounds": [],
|
|
|
"message": "未在指定日志目录下找到任何评估结果文件(*_评估.json)",
|
|
|
@@ -703,40 +704,29 @@ def pattern_dimension_analyze(
|
|
|
r = info["round"]
|
|
|
cumulative_points = info["cumulative_points"]
|
|
|
analyzed = _analyze_single_round(
|
|
|
- account_name=account_name,
|
|
|
- post_id=post_id,
|
|
|
patterns=deduped_patterns,
|
|
|
tree_index=tree_index,
|
|
|
cumulative_points=cumulative_points,
|
|
|
- match_threshold=match_threshold,
|
|
|
cluster_level=cluster_level,
|
|
|
)
|
|
|
analyzed["round"] = r
|
|
|
rounds_output.append(analyzed)
|
|
|
|
|
|
- result = {
|
|
|
+ return {
|
|
|
"account_name": account_name,
|
|
|
"post_id": post_id,
|
|
|
"log_id": log_id,
|
|
|
- "match_threshold": match_threshold,
|
|
|
"cluster_level": cluster_level,
|
|
|
"rounds": rounds_output,
|
|
|
}
|
|
|
- return result
|
|
|
|
|
|
|
|
|
-def main() -> None:
|
|
|
+def main(account_name, post_id, log_id) -> None:
|
|
|
"""本地简单测试:以家有大志账号的一次推导日志做分析,并将结果写入输出目录。"""
|
|
|
- account_name = "家有大志"
|
|
|
- post_id = "68fb6a5c000000000302e5de"
|
|
|
- # 需要根据实际运行结果修改为最新的 log_id
|
|
|
- log_id = "20260317112639"
|
|
|
-
|
|
|
result = pattern_dimension_analyze(
|
|
|
account_name=account_name,
|
|
|
post_id=post_id,
|
|
|
log_id=log_id,
|
|
|
- match_threshold=0.5,
|
|
|
cluster_level=3,
|
|
|
)
|
|
|
# 控制台打印前 4000 字符,便于快速查看
|
|
|
@@ -753,5 +743,15 @@ def main() -> None:
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- main()
|
|
|
+ account_name = "家有大志"
|
|
|
+
|
|
|
+ items = [
|
|
|
+ {"post_id": "68fb6a5c000000000302e5de", "log_id": "20260317214307"},
|
|
|
+ {"post_id": "69185d49000000000d00f94e", "log_id": "20260317214841"},
|
|
|
+ {"post_id": "6921937a000000001b0278d1", "log_id": "20260317215616"}
|
|
|
+ ]
|
|
|
+ for item in items:
|
|
|
+ post_id = item["post_id"]
|
|
|
+ log_id = item["log_id"]
|
|
|
+ main(account_name, post_id, log_id)
|
|
|
|