|
|
@@ -42,6 +42,7 @@ class EnhancedSearchV2:
|
|
|
self,
|
|
|
how_json_path: str,
|
|
|
dimension_associations_path: str,
|
|
|
+ intra_associations_path: str,
|
|
|
optimized_clustered_data_path: str,
|
|
|
openrouter_api_key: Optional[str] = None,
|
|
|
output_dir: str = "output_v2",
|
|
|
@@ -68,6 +69,7 @@ class EnhancedSearchV2:
|
|
|
Args:
|
|
|
how_json_path: How解构文件路径
|
|
|
dimension_associations_path: 维度关联文件路径
|
|
|
+ intra_associations_path: 维度内关联文件路径
|
|
|
optimized_clustered_data_path: 人设特征库路径
|
|
|
openrouter_api_key: OpenRouter API密钥
|
|
|
output_dir: 输出目录
|
|
|
@@ -90,6 +92,7 @@ class EnhancedSearchV2:
|
|
|
"""
|
|
|
self.how_json_path = how_json_path
|
|
|
self.dimension_associations_path = dimension_associations_path
|
|
|
+ self.intra_associations_path = intra_associations_path
|
|
|
self.optimized_clustered_data_path = optimized_clustered_data_path
|
|
|
self.output_dir = output_dir
|
|
|
self.top_n = top_n
|
|
|
@@ -110,6 +113,7 @@ class EnhancedSearchV2:
|
|
|
logger.info("加载数据文件...")
|
|
|
self.how_data = self._load_json(how_json_path)
|
|
|
self.dimension_associations = self._load_json(dimension_associations_path)
|
|
|
+ self.intra_associations = self._load_json(intra_associations_path)
|
|
|
self.optimized_clustered_data = self._load_json(optimized_clustered_data_path)
|
|
|
|
|
|
# 初始化组件
|
|
|
@@ -528,32 +532,43 @@ class EnhancedSearchV2:
|
|
|
search_path = classification_path
|
|
|
logger.info(f" 匹配到特征,使用所属分类: {search_path}")
|
|
|
|
|
|
- # 查找关联
|
|
|
+ # 查找跨维度关联
|
|
|
associations = self._find_associations(search_path, source_level)
|
|
|
+ logger.info(f" 找到 {len(associations)} 个跨维度关联")
|
|
|
+
|
|
|
+ # 查找维度内关联
|
|
|
+ intra_associations = self._find_intra_dimension_associations(search_path, source_level)
|
|
|
+ logger.info(f" 找到 {len(intra_associations)} 个维度内关联")
|
|
|
+
|
|
|
+ # 合并两种关联
|
|
|
+ all_associations = associations + intra_associations
|
|
|
|
|
|
# 收集关联信息
|
|
|
base_word_associations = []
|
|
|
|
|
|
- for assoc in associations:
|
|
|
+ for assoc in all_associations:
|
|
|
target_path = assoc['目标分类']
|
|
|
|
|
|
# 收集分类信息
|
|
|
classification_info = self._collect_classification_info(target_path)
|
|
|
|
|
|
if classification_info:
|
|
|
+ # 检查是否为维度内关联
|
|
|
+ is_intra = assoc['关联类型'] == '维度内组合关联'
|
|
|
+
|
|
|
base_word_associations.append({
|
|
|
'来源方向': assoc['来源方向'],
|
|
|
'关联类型': assoc['关联类型'],
|
|
|
'目标分类路径': target_path,
|
|
|
- '共同帖子数': assoc['共同帖子数'],
|
|
|
- 'Jaccard相似度': assoc['Jaccard相似度'],
|
|
|
+ '共同帖子数': assoc.get('点数', assoc.get('共同帖子数', 0)),
|
|
|
+ 'Jaccard相似度': assoc.get('Jaccard相似度', 0.0) if not is_intra else 0.0,
|
|
|
'分类名称': classification_info['classification_name'],
|
|
|
'标签列表': classification_info['tags'],
|
|
|
'子分类列表': classification_info['sub_classifications']
|
|
|
})
|
|
|
|
|
|
associations_by_base_word[base_word] = base_word_associations
|
|
|
- logger.info(f" 找到 {len(base_word_associations)} 个关联")
|
|
|
+ logger.info(f" 总计 {len(base_word_associations)} 个关联(跨维度: {len(associations)}, 维度内: {len(intra_associations)})")
|
|
|
|
|
|
# 保存结果
|
|
|
feature['找到的关联_按base_word'] = associations_by_base_word
|
|
|
@@ -632,6 +647,89 @@ class EnhancedSearchV2:
|
|
|
|
|
|
return associations
|
|
|
|
|
|
def _find_intra_dimension_associations(
    self,
    classification_path: str,
    source_level: str
) -> List[Dict[str, Any]]:
    """
    Find intra-dimension associations for a classification path.

    Looks up the leaf-classification combination clusters stored in
    ``self.intra_associations`` under the key '叶子分类组合聚类' for the
    dimension derived from ``source_level``. For every combination whose
    '|'-separated key contains the leaf name of ``classification_path``,
    one association is emitted for each *other* leaf classification listed
    in the cluster's point details.

    A target path that shows up in several points of the same combination
    is reported only once; results keep first-seen order.

    Args:
        classification_path: Classification path whose last '/'-separated
            segment is taken as the leaf classification name.
        source_level: Source level name; it must contain one of '灵感点',
            '关键点' or '目的点', which selects the dimension.

    Returns:
        A list of association dicts, each with the keys:
            - '来源方向': "<dimension>-维度内"
            - '关联类型': '维度内组合关联'
            - '目标分类': full path of the associated classification
            - '组合键': key of the combination the pair belongs to
            - '点数': the cluster's '点数' value (0 when absent)
            - '目标层级': same value as ``source_level``
        An empty list is returned when no intra-dimension data is loaded,
        the path is empty, the dimension cannot be determined, or the
        dimension has no clusters.
    """
    if not self.intra_associations or not classification_path:
        return []

    # Step 1: the leaf classification name is the last path segment.
    leaf_name = classification_path.split('/')[-1]

    # Step 2: determine the dimension; the first marker contained in
    # source_level wins, checked in this fixed priority order.
    dimension = next(
        (name for name in ('灵感点', '关键点', '目的点') if name in source_level),
        None,
    )
    if dimension is None:
        return []

    # Step 3: fetch the combination clusters of that dimension.
    clusters = self.intra_associations.get('叶子分类组合聚类', {}).get(dimension, {})
    if not clusters:
        return []

    associations = []
    # (combo_key, target_path) pairs already emitted; the same pair is
    # repeated in the raw data once per point, which would otherwise
    # produce duplicate associations downstream.
    emitted = set()

    # Step 4: walk every combination that contains the current leaf.
    for combo_key, cluster in clusters.items():
        if leaf_name not in combo_key.split('|'):
            continue

        point_count = cluster.get('点数', 0)

        for point in cluster.get('点详情列表', []):
            for feature in point.get('特征列表', []):
                other_leaf = feature.get('叶子分类', '')
                other_path = feature.get('完整路径', '')

                # Skip the source leaf itself and entries without a path.
                if other_leaf == leaf_name or not other_path:
                    continue

                pair = (combo_key, other_path)
                if pair in emitted:
                    continue
                emitted.add(pair)

                # Same structure as cross-dimension associations, so the
                # caller can handle both kinds uniformly.
                associations.append({
                    '来源方向': f'{dimension}-维度内',
                    '关联类型': '维度内组合关联',
                    '目标分类': other_path,
                    '组合键': combo_key,
                    '点数': point_count,
                    '目标层级': source_level,  # same dimension, same level
                })

    return associations
|
|
|
+
|
|
|
def _collect_classification_info(self, classification_path: str) -> Optional[Dict[str, Any]]:
|
|
|
"""
|
|
|
收集分类信息:分类名 + 标签 + 子分类
|
|
|
@@ -1686,6 +1784,11 @@ def main():
|
|
|
default='dimension_associations_analysis.json',
|
|
|
help='维度关联文件路径'
|
|
|
)
|
|
|
+ parser.add_argument(
|
|
|
+ '--intra-associations',
|
|
|
+ default='intra_dimension_associations_analysis.json',
|
|
|
+ help='维度内关联文件路径'
|
|
|
+ )
|
|
|
parser.add_argument(
|
|
|
'--optimized-clustered',
|
|
|
default='optimized_clustered_data_gemini-3-pro-preview.json',
|
|
|
@@ -1802,6 +1905,7 @@ def main():
|
|
|
system = EnhancedSearchV2(
|
|
|
how_json_path=args.how_json,
|
|
|
dimension_associations_path=args.dimension_associations,
|
|
|
+ intra_associations_path=args.intra_associations,
|
|
|
optimized_clustered_data_path=args.optimized_clustered,
|
|
|
openrouter_api_key=args.api_key,
|
|
|
output_dir=args.output_dir,
|