刘立冬 3 weeks ago
parent
commit
0fada6c296
2 changed files with 338 additions and 200 deletions
  1. +327 -200
      enhanced_search_v2.py
  2. +11 -0
      visualize_stage5_results_v3.py

+327 -200
enhanced_search_v2.py

@@ -170,45 +170,53 @@ class EnhancedSearchV2:
                             continue
 
                         # 0.5 <= max_similarity < 0.8: keep
-                        best_match = max(
+                        # Sort by similarity in descending order and take the top 3
+                        sorted_matches = sorted(
                             match_results,
-                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0)
+                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
+                            reverse=True
                         )
+                        top3_matches = sorted_matches[:3]  # take the top 3
 
-                        # Determine whether it is a classification or a feature
-                        feature_classification = best_match.get('特征分类', [])
-                        classification_path = self._build_classification_path(feature_classification)
+                        # Build the top-3 match info list
+                        top3_match_info = []
+                        for match in top3_matches:
+                            feature_classification = match.get('特征分类', [])
+                            classification_path = self._build_classification_path(feature_classification)
 
-                        # If the path is empty and the type is classification, search to complete the path
-                        if not classification_path and best_match.get('特征类型') == '分类':
-                            feature_name_to_search = best_match.get('人设特征名称', '')
-                            classification_path = self._search_classification_path(feature_name_to_search)
+                            # If the path is empty and the type is classification, search to complete the path
+                            if not classification_path and match.get('特征类型') == '分类':
+                                feature_name_to_search = match.get('人设特征名称', '')
+                                classification_path = self._search_classification_path(feature_name_to_search)
 
-                        is_classification = self._is_classification(best_match.get('人设特征名称', ''), classification_path)
+                            is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
+
+                            top3_match_info.append({
+                                '人设特征名称': match.get('人设特征名称'),
+                                '人设特征层级': match.get('人设特征层级'),
+                                '特征类型': match.get('特征类型'),
+                                '特征分类': feature_classification,
+                                '相似度': match.get('匹配结果', {}).get('相似度', 0),
+                                '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                '是分类': is_classification,
+                                '所属分类路径': classification_path
+                            })
 
                         result_item = {
                             '原始特征名称': feature_name,
                             '来源层级': level_name,
                             '权重': feature.get('权重', 0),
                             '所属点名称': item_name,
-                            '最高匹配信息': {
-                                '人设特征名称': best_match.get('人设特征名称'),
-                                '人设特征层级': best_match.get('人设特征层级'),
-                                '特征类型': best_match.get('特征类型'),
-                                '特征分类': feature_classification,
-                                '相似度': best_match.get('匹配结果', {}).get('相似度', 0),
-                                '匹配说明': best_match.get('匹配结果', {}).get('说明', ''),
-                                '是分类': is_classification,
-                                '所属分类路径': classification_path
-                            }
+                            '最高匹配信息': top3_match_info[0],  # keep the first one for Stage 2
+                            'top3匹配信息': top3_match_info  # new field
                         }
 
                         results.append(result_item)
                         selected_count += 1
 
-                        logger.info(f"  ✓ {feature_name} → {best_match.get('人设特征名称')} "
-                                   f"(similarity: {max_similarity:.3f}, "
-                                   f"{'classification' if is_classification else 'feature'})")
+                        # Log the top-3 match info
+                        top3_names = [m['人设特征名称'] for m in top3_match_info]
+                        logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")
 
         # Summary statistics
         logger.info(f"\n" + "=" * 60)
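
Note on the Stage 1 change above: the selection boils down to a sort-then-slice over the nested '匹配结果' → '相似度' score. A minimal standalone sketch (the input shape is taken from the diff; the names and scores are hypothetical):

    # Hypothetical match_results; shape mirrors the diff above.
    match_results = [
        {'人设特征名称': 'A', '匹配结果': {'相似度': 0.62}},
        {'人设特征名称': 'B', '匹配结果': {'相似度': 0.71}},
        {'人设特征名称': 'C', '匹配结果': {'相似度': 0.55}},
        {'人设特征名称': 'D', '匹配结果': {'相似度': 0.68}},
    ]

    # Sort descending by similarity, keep the top 3.
    top3 = sorted(
        match_results,
        key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
        reverse=True,
    )[:3]

    print([m['人设特征名称'] for m in top3])  # ['B', 'D', 'A']

For larger candidate lists, heapq.nlargest(3, match_results, key=...) would avoid the full sort, though at top-3 over a handful of matches the difference is negligible.
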
@@ -430,6 +438,8 @@ class EnhancedSearchV2:
         """
         阶段2:查找关联分类,收集分类名称、标签、子分类
 
+        改进: 为top3的每个base_word都查找关联
+
         Args:
             filtered_features: 阶段1筛选的特征
 
@@ -437,52 +447,75 @@ class EnhancedSearchV2:
             带关联信息的特征列表
         """
         logger.info("=" * 60)
-        logger.info("阶段2:查找关联分类")
+        logger.info("阶段2:查找关联分类(为每个base_word)")
         logger.info("=" * 60)
 
         for idx, feature in enumerate(filtered_features, 1):
             logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {feature['原始特征名称']}")
 
-            match_info = feature['最高匹配信息']
-            is_classification = match_info['是分类']
-            classification_path = match_info['所属分类路径']
-            source_level = match_info['人设特征层级']
+            # Get the top-3 base_words
+            top3_info = feature.get('top3匹配信息', [])
+            if not top3_info:
+                logger.warning(f"  No top-3 match info, skipping")
+                feature['找到的关联_按base_word'] = {}
+                continue
 
-            if is_classification:
-                # Matched a classification; use its path directly
-                search_path = classification_path
-                logger.info(f"  Matched classification: {search_path}")
-            else:
-                # Matched a feature; use the path of its parent classification
-                search_path = classification_path
-                logger.info(f"  Matched a feature; using its classification: {search_path}")
-
-            # Find associations
-            associations = self._find_associations(search_path, source_level)
-
-            # Collect association info
-            feature['找到的关联'] = []
-
-            for assoc in associations:
-                target_path = assoc['目标分类']
-                logger.info(f"  Processing association: {target_path}")
-
-                # Collect classification info
-                classification_info = self._collect_classification_info(target_path)
-
-                if classification_info:
-                    feature['找到的关联'].append({
-                        '来源方向': assoc['来源方向'],
-                        '关联类型': assoc['关联类型'],
-                        '目标分类路径': target_path,
-                        '共同帖子数': assoc['共同帖子数'],
-                        'Jaccard相似度': assoc['Jaccard相似度'],
-                        '分类名称': classification_info['classification_name'],
-                        '标签列表': classification_info['tags'],
-                        '子分类列表': classification_info['sub_classifications']
-                    })
+            logger.info(f"  Found {len(top3_info)} base_words")
+
+            # Find associations for each base_word
+            associations_by_base_word = {}
 
-            logger.info(f"  Found {len(feature['找到的关联'])} associations")
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                is_classification = base_info['是分类']
+                classification_path = base_info['所属分类路径']
+                source_level = base_info['人设特征层级']
+
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
+
+                if is_classification:
+                    search_path = classification_path
+                    logger.info(f"    Matched classification: {search_path}")
+                else:
+                    search_path = classification_path
+                    logger.info(f"    Matched a feature; using its classification: {search_path}")
+
+                # Find associations
+                associations = self._find_associations(search_path, source_level)
+
+                # Collect association info
+                base_word_associations = []
+
+                for assoc in associations:
+                    target_path = assoc['目标分类']
+
+                    # Collect classification info
+                    classification_info = self._collect_classification_info(target_path)
+
+                    if classification_info:
+                        base_word_associations.append({
+                            '来源方向': assoc['来源方向'],
+                            '关联类型': assoc['关联类型'],
+                            '目标分类路径': target_path,
+                            '共同帖子数': assoc['共同帖子数'],
+                            'Jaccard相似度': assoc['Jaccard相似度'],
+                            '分类名称': classification_info['classification_name'],
+                            '标签列表': classification_info['tags'],
+                            '子分类列表': classification_info['sub_classifications']
+                        })
+
+                associations_by_base_word[base_word] = base_word_associations
+                logger.info(f"    Found {len(base_word_associations)} associations")
+
+            # Save results
+            feature['找到的关联_按base_word'] = associations_by_base_word
+
+            # Backward compatibility: keep the associations based on 最高匹配信息 (i.e., the first base_word's associations)
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])
+
+            total_associations = sum(len(v) for v in associations_by_base_word.values())
+            logger.info(f"  Found {total_associations} associations in total ({len(associations_by_base_word)} base_words)")
 
         # Save results
         output_path = os.path.join(self.output_dir, "stage2_associations.json")
@@ -589,8 +622,8 @@ class EnhancedSearchV2:
         """
         Stage 3: Filter high-similarity matches (>0.8)
 
-        Iterate over all original features in the how-deconstruction data and find
-        high-quality matches with similarity >0.8 whose 人设特征名称 falls within the Stage 2 association scope
+        Improvement: filter candidate words independently for each base_word.
+        Within that base_word's association scope, find matches with similarity >0.8 in the how-deconstruction data.
 
         Args:
             associations_data: association data from Stage 2
@@ -599,76 +632,107 @@
             Data with high-similarity candidates
         """
         logger.info("=" * 60)
-        logger.info("Stage 3: Filter high-similarity matches (>0.8)")
+        logger.info("Stage 3: Filter high-similarity matches (>0.8, per base_word)")
         logger.info("=" * 60)
 
         for idx, feature_result in enumerate(associations_data, 1):
             original_feature_name = feature_result['原始特征名称']
             logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")
 
-            # Step 1: Collect the Stage 2 association scope (classification names + tags)
-            stage2_scope = self._collect_stage2_scope(feature_result)
-            logger.info(f"  Stage 2 scope contains {len(stage2_scope)} classifications/tags")
+            # Get the top-3 base_words
+            top3_info = feature_result.get('top3匹配信息', [])
+            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
+
+            if not top3_info or not associations_by_base_word:
+                logger.warning(f"  No top-3 match info or association data, skipping")
+                feature_result['高相似度候选_按base_word'] = {}
+                continue
+
+            logger.info(f"  Found {len(top3_info)} base_words")
+
+            # Filter candidate words independently for each base_word
+            candidates_by_base_word = {}
 
-            # Step 2: Iterate over all original features in the how-deconstruction and find high-similarity matches
-            high_sim_candidates = []
-            total_checked = 0
-            high_sim_found = 0
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
 
-            how_result = self.how_data.get('how解构结果', {})
-            for level_name, level_list in how_result.items():
-                if not isinstance(level_list, list):
+                # Step 1: Collect this base_word's association scope
+                base_word_associations = associations_by_base_word.get(base_word, [])
+                base_word_scope = self._collect_scope_from_associations(base_word_associations)
+                logger.info(f"    Association scope contains {len(base_word_scope)} classifications/tags")
+
+                if not base_word_scope:
+                    logger.warning(f"    Empty association scope, skipping")
+                    candidates_by_base_word[base_word] = []
                     continue
 
-                for item in level_list:
-                    for step in item.get('how步骤列表', []):
-                        for feature in step.get('特征列表', []):
-                            # Get all matches for this feature
-                            matches = feature.get('匹配结果', [])
-                            total_checked += len(matches)
-
-                            # Filter matches with similarity >0.8 that fall within the Stage 2 scope
-                            for match in matches:
-                                sim = match.get('匹配结果', {}).get('相似度', 0)
-                                persona_feature_name = match.get('人设特征名称', '')
-
-                                if sim > 0.8 and persona_feature_name in stage2_scope:
-                                    high_sim_found += 1
-                                    # Record provenance info
-                                    high_sim_candidates.append({
-                                        '人设特征名称': persona_feature_name,
-                                        '相似度': sim,
-                                        '特征类型': match.get('特征类型', ''),
-                                        '特征分类': match.get('特征分类', []),
-                                        '人设特征层级': match.get('人设特征层级', ''),
-                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
-                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
-                                        '来源原始特征': feature.get('特征名称', '')  # record which original feature it came from
-                                    })
-
-            logger.info(f"  Checked {total_checked} matches")
-            logger.info(f"  Found {high_sim_found} matches with similarity >0.8")
-
-            # Sort by similarity descending and deduplicate (keep only the top score per 人设特征名称)
-            seen_names = set()
-            unique_candidates = []
-            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
-
-            for candidate in high_sim_candidates:
-                name = candidate['人设特征名称']
-                if name not in seen_names:
-                    seen_names.add(name)
-                    unique_candidates.append(candidate)
-
-            # Add to the results
-            feature_result['高相似度候选'] = unique_candidates
-            logger.info(f"  {len(unique_candidates)} high-similarity candidates after deduplication")
-
-            # Show the top 5
-            if unique_candidates:
-                logger.info(f"  Top 5:")
-                for c in unique_candidates[:5]:
-                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")
+                # Step 2: Iterate over the how-deconstruction and find high-similarity matches
+                high_sim_candidates = []
+                total_checked = 0
+                high_sim_found = 0
+
+                how_result = self.how_data.get('how解构结果', {})
+                for level_name, level_list in how_result.items():
+                    if not isinstance(level_list, list):
+                        continue
+
+                    for item in level_list:
+                        for step in item.get('how步骤列表', []):
+                            for feature in step.get('特征列表', []):
+                                matches = feature.get('匹配结果', [])
+                                total_checked += len(matches)
+
+                                # Filter matches with similarity >0.8 that fall within this base_word's scope
+                                for match in matches:
+                                    sim = match.get('匹配结果', {}).get('相似度', 0)
+                                    persona_feature_name = match.get('人设特征名称', '')
+
+                                    if sim > 0.8 and persona_feature_name in base_word_scope:
+                                        high_sim_found += 1
+                                        high_sim_candidates.append({
+                                            '人设特征名称': persona_feature_name,
+                                            '相似度': sim,
+                                            '特征类型': match.get('特征类型', ''),
+                                            '特征分类': match.get('特征分类', []),
+                                            '人设特征层级': match.get('人设特征层级', ''),
+                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                            '来源原始特征': feature.get('特征名称', '')
+                                        })
+
+                logger.info(f"    Checked {total_checked} matches")
+                logger.info(f"    Found {high_sim_found} matches with similarity >0.8")
+
+                # Sort by similarity descending and deduplicate
+                seen_names = set()
+                unique_candidates = []
+                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+                for candidate in high_sim_candidates:
+                    name = candidate['人设特征名称']
+                    if name not in seen_names:
+                        seen_names.add(name)
+                        unique_candidates.append(candidate)
+
+                candidates_by_base_word[base_word] = unique_candidates
+                logger.info(f"    {len(unique_candidates)} candidates after deduplication")
+
+                # Show the top 5
+                if unique_candidates:
+                    logger.info(f"    Top 5:")
+                    for c in unique_candidates[:5]:
+                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")
+
+            # Save results
+            feature_result['高相似度候选_按base_word'] = candidates_by_base_word
+
+            # Backward compatibility: keep the first base_word's candidates
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])
+
+            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
+            logger.info(f"  Selected {total_candidates} candidates in total ({len(candidates_by_base_word)} base_words)")
 
         # Save results
         output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
@@ -681,19 +745,19 @@ class EnhancedSearchV2:
         return associations_data
 
 
-    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
         """
-        Collect all classification names and tags found in Stage 2 into a scope set
+        Collect all classification names and tags from an association list into a scope set
 
         Args:
-            feature_result: feature result data
+            associations: list of associations
 
         Returns:
             A set containing all classification names and tags
         """
         scope = set()
 
-        for assoc in feature_result.get('找到的关联', []):
+        for assoc in associations:
             # Add the classification name
             scope.add(assoc['分类名称'])
 
@@ -703,6 +767,19 @@ class EnhancedSearchV2:
 
         return scope
 
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        Collect all classification names and tags found in Stage 2 into a scope set (legacy wrapper)
+
+        Args:
+            feature_result: feature result data
+
+        Returns:
+            A set containing all classification names and tags
+        """
+        associations = feature_result.get('找到的关联', [])
+        return self._collect_scope_from_associations(associations)
+
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Look up the feature list for a given classification path
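
For reference, the scope set built by _collect_scope_from_associations is a flat set of classification names and tags, so the Stage 3 membership checks are O(1). A minimal sketch (hypothetical data; this assumes the elided part of the method also adds the 标签列表 entries, as the Stage 2 output fields suggest):

    # Hypothetical associations in the Stage 2 output shape.
    associations = [
        {'分类名称': '饮品', '标签列表': ['拿铁', '美式']},
        {'分类名称': '烘焙', '标签列表': ['可颂']},
    ]

    scope = set()
    for assoc in associations:
        scope.add(assoc['分类名称'])            # classification name
        scope.update(assoc.get('标签列表', []))  # its tags

    print('拿铁' in scope, '蛋糕' in scope)  # True False
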
@@ -797,12 +874,16 @@ class EnhancedSearchV2:
         """
         Generate and evaluate search-word combinations for a single original feature
 
+        Improvement: each base_word uses its own candidate words (instead of a shared pool)
+
         Steps:
-        1. Get base_word from Stage1's 最高匹配信息
-        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
-        3. Generate 2-N word combinations
-        4. LLM batch evaluation
-        5. Select Top 10 and write back
+        1. Get top3 base_words from Stage1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from Stage3's 高相似度候选_按base_word
+           b. Generate combinations
+           c. LLM evaluation
+           d. Select Top 10
+        3. Save grouped results
 
         Args:
             idx: 特征索引
@@ -814,62 +895,94 @@ class EnhancedSearchV2:
         original_feature = feature_result['原始特征名称']
         logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
-        # Step 1: Get the base word
-        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
-        if not base_word:
-            logger.info(f"  No base word, skipping")
-            feature_result['组合评估结果'] = []
+        # Step 1: Get the top-3 base words
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info(f"  No top-3 match info, skipping")
+            feature_result['组合评估结果_分组'] = []
             return
 
-        logger.info(f"  Base word: {base_word}")
+        logger.info(f"  Found {len(top3_info)} base_words")
 
-        # Step 2: Get candidate words (from the high-similarity candidates)
-        high_sim_candidates = feature_result.get('高相似度候选', [])
+        # Step 2: Get the candidate words grouped by base_word
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
 
-        # Cap the number of candidate words
-        candidates = high_sim_candidates[:max_candidates]
-        candidate_words = [c['人设特征名称'] for c in candidates]
-
-        if not candidate_words:
-            logger.info(f"  No candidate words, skipping")
-            feature_result['组合评估结果'] = []
+        if not candidates_by_base_word:
+            logger.warning(f"  No per-base_word candidate words, skipping")
+            feature_result['组合评估结果_分组'] = []
             return
 
-        logger.info(f"  Candidate word count: {len(candidate_words)} (limit: {max_candidates})")
+        # Step 3: Process each base_word independently
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word} (similarity: {base_similarity:.3f})")
 
-        # Step 3: Generate all combinations
-        all_combinations = []
+            # Get this base_word's candidate words
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['人设特征名称'] for c in candidates]
 
-        # Generate candidate-word combos of 1 to max_combo_length-1 words (the base_word is added on top)
-        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
-            for combo in combinations(candidate_words, length):
-                # Assemble the search phrase: base word + candidate combo
-                search_phrase = base_word + ' ' + ' '.join(combo)
-                all_combinations.append({
-                    'search_word': search_phrase,
+            if not candidate_words:
+                logger.warning(f"    No candidate words for this base_word, skipping")
+                grouped_results.append({
                     'base_word': base_word,
-                    'candidate_words': list(combo),
-                    'combo_length': length + 1  # +1 because the base_word is included
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
                 })
+                continue
 
-        logger.info(f"  Generated {len(all_combinations)} combinations")
+            logger.info(f"    Candidate word count: {len(candidate_words)} (limit: {max_candidates})")
+
+            # Generate combinations
+            combinations_for_base = []
+            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+                for combo in combinations(candidate_words, length):
+                    search_phrase = base_word + ' ' + ' '.join(combo)
+                    combinations_for_base.append({
+                        'search_word': search_phrase,
+                        'base_word': base_word,
+                        'candidate_words': list(combo),
+                        'combo_length': length + 1
+                    })
 
-        # Step 4: Batch LLM evaluation
-        logger.info(f"  Starting LLM evaluation...")
-        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-            original_feature=original_feature,
-            search_words=[c['search_word'] for c in all_combinations],
-            batch_size=50
-        )
+            logger.info(f"    Generated {len(combinations_for_base)} combinations")
+
+            # LLM evaluation
+            logger.info(f"    Starting LLM evaluation...")
+            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+                original_feature=original_feature,
+                search_words=[c['search_word'] for c in combinations_for_base],
+                batch_size=50
+            )
+
+            # Pick the Top 10
+            top_10 = evaluated[:10]
+            max_score = top_10[0]['score'] if top_10 else 0.0
+            logger.info(f"    Evaluation done; best Top 10 score: {max_score:.3f}")
 
-        # Step 5: Pick the Top 10
-        top_10 = evaluated[:10]
+            # Save the grouped result: each base_word gets its own available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # this base_word's own candidate words
+            })
 
         # Write results back
-        feature_result['组合评估结果'] = top_10
+        feature_result['组合评估结果_分组'] = grouped_results
 
-        max_score = top_10[0]['score'] if top_10 else 0.0
-        logger.info(f"  Evaluation done; best Top 10 score: {max_score:.3f}")
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  Done: {len(grouped_results)} base_words, {total_searches} search phrases in total")
 
     # ========== Stage 5: Execute searches ==========
 
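
The combination generator in Step 3 enumerates every 1- to (max_combo_length-1)-word subset of the candidates and prefixes each with the base_word, so a phrase never exceeds max_combo_length words in total. A standalone sketch with hypothetical words:

    from itertools import combinations

    base_word = '手冲咖啡'                     # hypothetical
    candidate_words = ['拿铁', '豆子', '滤杯']  # hypothetical
    max_combo_length = 3                       # total words incl. base_word

    phrases = []
    for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
        for combo in combinations(candidate_words, length):
            phrases.append(base_word + ' ' + ' '.join(combo))

    print(len(phrases))  # C(3,1) + C(3,2) = 6
    print(phrases[0])    # 手冲咖啡 拿铁

Note that the count grows combinatorially with max_candidates, which is why the cap and the batched LLM evaluation (batch_size=50) matter.
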
@@ -954,7 +1067,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
 
-        # Collect search words grouped by original feature (read from Stage 4's 组合评估结果)
+        # Collect search words grouped by original feature (read from Stage 4's 组合评估结果_分组)
         feature_search_groups = {}
 
         for feature_result in features_data:
@@ -963,46 +1076,60 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []
 
-            # Read from Stage 4's 组合评估结果
-            for eval_item in feature_result.get('组合评估结果', []):
-                sw = eval_item.get('search_word')
-                if not sw:
-                    continue
+            # Read from Stage 4's 组合评估结果_分组 (new structure)
+            grouped_results = feature_result.get('组合评估结果_分组', [])
 
-                score = eval_item.get('score', 0.0)
+            if grouped_results:
+                # Grouped structure: run the top-10 of every base_word
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)
 
-                feature_search_groups[original_feature].append({
-                    'search_word': sw,
-                    'score': score,
-                    'feature_ref': eval_item  # reference to the eval item, used to write back search results
-                })
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue
 
-        # Take the Top N per group
-        all_searches = []
-        total_before_filter = 0
-        total_filtered = 0
+                        score = eval_item.get('score', 0.0)
 
-        for original_feature, search_list in feature_search_groups.items():
-            total_before_filter += len(search_list)
+                        feature_search_groups[original_feature].append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # reference to the eval item, used to write back search results
+                        })
+            else:
+                # Backward compatibility with the old structure (组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue
 
-            # Sort by score descending
-            sorted_list = sorted(search_list, key=lambda x: x['score'], reverse=True)
+                    score = eval_item.get('score', 0.0)
 
-            # Take the first top_n
-            selected = sorted_list[:top_n]
-            all_searches.extend(selected)
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
+
+        # Collect all search tasks (under the grouped structure every base_word's top-10 runs; no further filtering)
+        all_searches = []
+        total_count = 0
 
-            filtered = len(sorted_list) - len(selected)
-            total_filtered += filtered
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)
 
-            logger.info(f"  {original_feature}: selected Top {len(selected)} of {len(sorted_list)} search words ({filtered} filtered out)")
+            logger.info(f"  {original_feature}: {len(search_list)} search words")
 
         # Apply the global search-count limit
         if self.max_total_searches and len(all_searches) > self.max_total_searches:
             logger.info(f"  Applying global limit: reducing from {len(all_searches)} to {self.max_total_searches}")
             all_searches = all_searches[:self.max_total_searches]
 
-        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
+        logger.info(f"\n{len(all_searches)} search tasks in total")
         logger.info(f"  Executing searches concurrently (workers: {self.search_max_workers})")
 
         # Execute searches concurrently with ThreadPoolExecutor

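
Taken together, Stage 5 now prefers the grouped Stage 4 output and falls back to the flat legacy field, so older result files still run. A condensed sketch of that read path (hypothetical data; field names from the diff):

    def collect_search_words(feature_result):
        grouped = feature_result.get('组合评估结果_分组', [])
        if grouped:
            # New structure: every base_word's top-10 is executed.
            return [item['search_word']
                    for group in grouped
                    for item in group.get('top10_searches', [])]
        # Legacy flat structure.
        return [item['search_word']
                for item in feature_result.get('组合评估结果', [])]

    legacy = {'组合评估结果': [{'search_word': '手冲咖啡 拿铁', 'score': 0.9}]}
    print(collect_search_words(legacy))  # ['手冲咖啡 拿铁']
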
+11 -0
visualize_stage5_results_v3.py

@@ -281,6 +281,14 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
             margin-left: 8px;
         }}
 
+        .base-word-available-words {{
+            font-size: 10px;
+            color: #9ca3af;
+            margin-top: 4px;
+            line-height: 1.5;
+            font-weight: normal;
+        }}
+
         /* Level 3: Search Terms */
         .search-terms-list {{
             display: none;
@@ -702,12 +710,15 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                     const baseWord = group.base_word || '';
                     const baseSimilarity = group.base_word_similarity || 0;
                     const searchTerms = group.top10_searches || [];
+                    const availableWords = group.available_words || [];
+                    const wordsStr = availableWords.join('、');
 
                     html += `
                         <div class="base-word-item" onclick="event.stopPropagation(); toggleBaseWord(${{featureIdx}}, ${{groupIdx}})" id="base-word-${{featureIdx}}-${{groupIdx}}">
                             <span class="base-expand-icon">▶</span>
                             <div class="base-word-text">
                                 🎯 ${{baseWord}}
+                                ${{wordsStr ? `<div class="base-word-available-words">${{wordsStr}}</div>` : ''}}
                             </div>
                         </div>
                         <div class="search-terms-list" id="search-terms-${{featureIdx}}-${{groupIdx}}">