刘立冬 pred 3 týždňami
rodič
commit
50da468447
1 zmenený súbor: 109 pridaní a 5 odobraní
  1. 109 5
      enhanced_search_v2.py

+ 109 - 5
enhanced_search_v2.py

@@ -42,6 +42,7 @@ class EnhancedSearchV2:
         self,
         how_json_path: str,
         dimension_associations_path: str,
+        intra_associations_path: str,
         optimized_clustered_data_path: str,
         openrouter_api_key: Optional[str] = None,
         output_dir: str = "output_v2",
@@ -68,6 +69,7 @@ class EnhancedSearchV2:
         Args:
             how_json_path: How解构文件路径
             dimension_associations_path: 维度关联文件路径
+            intra_associations_path: 维度内关联文件路径
             optimized_clustered_data_path: 人设特征库路径
             openrouter_api_key: OpenRouter API密钥
             output_dir: 输出目录
@@ -90,6 +92,7 @@ class EnhancedSearchV2:
         """
         self.how_json_path = how_json_path
         self.dimension_associations_path = dimension_associations_path
+        self.intra_associations_path = intra_associations_path
         self.optimized_clustered_data_path = optimized_clustered_data_path
         self.output_dir = output_dir
         self.top_n = top_n
@@ -110,6 +113,7 @@ class EnhancedSearchV2:
         logger.info("加载数据文件...")
         self.how_data = self._load_json(how_json_path)
         self.dimension_associations = self._load_json(dimension_associations_path)
+        self.intra_associations = self._load_json(intra_associations_path)
         self.optimized_clustered_data = self._load_json(optimized_clustered_data_path)
 
         # 初始化组件
@@ -528,32 +532,43 @@ class EnhancedSearchV2:
                     search_path = classification_path
                     logger.info(f"    匹配到特征,使用所属分类: {search_path}")
 
-                # 查找关联
+                # 查找跨维度关联
                 associations = self._find_associations(search_path, source_level)
+                logger.info(f"    找到 {len(associations)} 个跨维度关联")
+
+                # 查找维度内关联
+                intra_associations = self._find_intra_dimension_associations(search_path, source_level)
+                logger.info(f"    找到 {len(intra_associations)} 个维度内关联")
+
+                # 合并两种关联
+                all_associations = associations + intra_associations
 
                 # 收集关联信息
                 base_word_associations = []
 
-                for assoc in associations:
+                for assoc in all_associations:
                     target_path = assoc['目标分类']
 
                     # 收集分类信息
                     classification_info = self._collect_classification_info(target_path)
 
                     if classification_info:
+                        # 检查是否为维度内关联
+                        is_intra = assoc['关联类型'] == '维度内组合关联'
+
                         base_word_associations.append({
                             '来源方向': assoc['来源方向'],
                             '关联类型': assoc['关联类型'],
                             '目标分类路径': target_path,
-                            '共同帖子数': assoc['共同帖子数'],
-                            'Jaccard相似度': assoc['Jaccard相似度'],
+                            '共同帖子数': assoc.get('点数', assoc.get('共同帖子数', 0)),
+                            'Jaccard相似度': assoc.get('Jaccard相似度', 0.0) if not is_intra else 0.0,
                             '分类名称': classification_info['classification_name'],
                             '标签列表': classification_info['tags'],
                             '子分类列表': classification_info['sub_classifications']
                         })
 
                 associations_by_base_word[base_word] = base_word_associations
-                logger.info(f"    找到 {len(base_word_associations)} 个关联")
+                logger.info(f"    总计 {len(base_word_associations)} 个关联(跨维度: {len(associations)}, 维度内: {len(intra_associations)})")
 
             # 保存结果
             feature['找到的关联_按base_word'] = associations_by_base_word
@@ -632,6 +647,89 @@ class EnhancedSearchV2:
 
         return associations
 
+    def _find_intra_dimension_associations(
+        self,
+        classification_path: str,
+        source_level: str
+    ) -> List[Dict[str, Any]]:
+        """
+        Find intra-dimension associations for a classification.
+
+        Looks up, inside a single dimension, the leaf-classification
+        combinations that contain the leaf of ``classification_path`` and
+        returns one association per distinct (combination key, target path)
+        pair. Features repeated across several points of the same
+        combination would otherwise produce identical entries, so they are
+        collapsed; first-seen order is preserved.
+
+        Args:
+            classification_path: Classification path whose last
+                '/'-separated segment is used as the leaf classification
+                name, e.g. "实质/身份与情绪/生理状态与行为/疲惫与熬夜状态".
+            source_level: Source level name, e.g. "关键点列表". The
+                dimension is the first of '灵感点', '关键点', '目的点'
+                found as a substring of this value.
+
+        Returns:
+            A list of association dicts (empty when no data is loaded, the
+            path is empty, the dimension cannot be determined, or the
+            dimension has no clusters). Each dict contains:
+            - 来源方向: "<dimension>-维度内"
+            - 关联类型: "维度内组合关联"
+            - 目标分类: the '完整路径' of the associated feature
+            - 组合键: the combination key, e.g. "夸张极致表现|疲惫与熬夜状态"
+            - 点数: the cluster's '点数' value (0 when absent)
+            - 目标层级: same as ``source_level``
+        """
+        if not self.intra_associations or not classification_path:
+            return []
+
+        # Step 1: the leaf classification name is the last path segment.
+        leaf_name = classification_path.split('/')[-1]
+
+        # Step 2: determine the dimension; the first matching name wins.
+        dimension = next(
+            (name for name in ('灵感点', '关键点', '目的点') if name in source_level),
+            None
+        )
+        if not dimension:
+            return []
+
+        # Step 3: fetch the combination clusters of that dimension.
+        clusters = self.intra_associations.get('叶子分类组合聚类', {}).get(dimension, {})
+        if not clusters:
+            return []
+
+        associations = []
+        # (combo_key, target path) pairs already emitted. Every other field
+        # of an association is constant for a given pair, so repeats would
+        # be exact duplicates.
+        seen = set()
+
+        # Step 4: walk every combination that contains the current leaf.
+        for combo_key, cluster in clusters.items():
+            if leaf_name not in combo_key.split('|'):
+                continue
+
+            point_count = cluster.get('点数', 0)
+
+            # Collect the other features listed in the point details.
+            for point in cluster.get('点详情列表', []):
+                for feature in point.get('特征列表', []):
+                    other_leaf = feature.get('叶子分类', '')
+                    other_path = feature.get('完整路径', '')
+
+                    # Skip the source leaf itself and features without a path.
+                    if other_leaf == leaf_name or not other_path:
+                        continue
+
+                    dedup_key = (combo_key, other_path)
+                    if dedup_key in seen:
+                        continue
+                    seen.add(dedup_key)
+
+                    # Same structure as cross-dimension associations
+                    # ('目标分类' is the shared target-path key).
+                    associations.append({
+                        '来源方向': f'{dimension}-维度内',
+                        '关联类型': '维度内组合关联',
+                        '目标分类': other_path,
+                        '组合键': combo_key,
+                        '点数': point_count,
+                        '目标层级': source_level  # same dimension, same level
+                    })
+
+        return associations
+
     def _collect_classification_info(self, classification_path: str) -> Optional[Dict[str, Any]]:
         """
         收集分类信息:分类名 + 标签 + 子分类
@@ -1686,6 +1784,11 @@ def main():
         default='dimension_associations_analysis.json',
         help='维度关联文件路径'
     )
+    parser.add_argument(
+        '--intra-associations',
+        default='intra_dimension_associations_analysis.json',
+        help='维度内关联文件路径'
+    )
     parser.add_argument(
         '--optimized-clustered',
         default='optimized_clustered_data_gemini-3-pro-preview.json',
@@ -1802,6 +1905,7 @@ def main():
     system = EnhancedSearchV2(
         how_json_path=args.how_json,
         dimension_associations_path=args.dimension_associations,
+        intra_associations_path=args.intra_associations,
         optimized_clustered_data_path=args.optimized_clustered,
         openrouter_api_key=args.api_key,
         output_dir=args.output_dir,