@@ -170,45 +170,53 @@ class EnhancedSearchV2:
                     continue

                # 0.5 <= max_similarity < 0.8: keep it
-                best_match = max(
+                # Sort by similarity in descending order and take the top 3
+                sorted_matches = sorted(
                     match_results,
-                    key=lambda x: x.get('匹配结果', {}).get('相似度', 0)
+                    key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
+                    reverse=True
                 )
+                top3_matches = sorted_matches[:3]  # take the top 3
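+                # Note: the slice simply yields fewer than 3 entries when
+                # match_results has fewer matches; the removed max() call
+                # already assumed match_results is non-empty, so indexing
+                # top3_match_info[0] below stays safe.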

-                # Decide whether it is a classification or a feature
-                feature_classification = best_match.get('特征分类', [])
-                classification_path = self._build_classification_path(feature_classification)
+                # Build the top-3 match info list
+                top3_match_info = []
+                for match in top3_matches:
+                    feature_classification = match.get('特征分类', [])
+                    classification_path = self._build_classification_path(feature_classification)

-                # If the path is empty and the type is classification, search to complete the path
-                if not classification_path and best_match.get('特征类型') == '分类':
-                    feature_name_to_search = best_match.get('人设特征名称', '')
-                    classification_path = self._search_classification_path(feature_name_to_search)
+                    # If the path is empty and the type is classification, search to complete the path
+                    if not classification_path and match.get('特征类型') == '分类':
+                        feature_name_to_search = match.get('人设特征名称', '')
+                        classification_path = self._search_classification_path(feature_name_to_search)

-                is_classification = self._is_classification(best_match.get('人设特征名称', ''), classification_path)
+                    is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
+
+                    top3_match_info.append({
+                        '人设特征名称': match.get('人设特征名称'),
+                        '人设特征层级': match.get('人设特征层级'),
+                        '特征类型': match.get('特征类型'),
+                        '特征分类': feature_classification,
+                        '相似度': match.get('匹配结果', {}).get('相似度', 0),
+                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                        '是分类': is_classification,
+                        '所属分类路径': classification_path
+                    })
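+                # Note: _search_classification_path now runs once per retained
+                # match whose path comes back empty (up to 3 lookups per
+                # feature) instead of at most once in the best_match version.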

                 result_item = {
                     '原始特征名称': feature_name,
                     '来源层级': level_name,
                     '权重': feature.get('权重', 0),
                     '所属点名称': item_name,
-                    '最高匹配信息': {
-                        '人设特征名称': best_match.get('人设特征名称'),
-                        '人设特征层级': best_match.get('人设特征层级'),
-                        '特征类型': best_match.get('特征类型'),
-                        '特征分类': feature_classification,
-                        '相似度': best_match.get('匹配结果', {}).get('相似度', 0),
-                        '匹配说明': best_match.get('匹配结果', {}).get('说明', ''),
-                        '是分类': is_classification,
-                        '所属分类路径': classification_path
-                    }
+                    '最高匹配信息': top3_match_info[0],  # keep the 1st for Stage 2
+                    'top3匹配信息': top3_match_info  # new field
                 }

                 results.append(result_item)
                 selected_count += 1

-                logger.info(f"  ✓ {feature_name} → {best_match.get('人设特征名称')} "
-                            f"(相似度: {max_similarity:.3f}, "
-                            f"{'分类' if is_classification else '特征'})")
+                # Log the top-3 match info
+                top3_names = [m['人设特征名称'] for m in top3_match_info]
+                logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")

         # Summary statistics
         logger.info(f"\n" + "=" * 60)
@@ -430,6 +438,8 @@ class EnhancedSearchV2:

         """
         Stage 2: look up associated classifications; collect classification names, tags and sub-classifications

+        Improvement: look up associations for every one of the top-3 base_words
+
         Args:
             filtered_features: features selected in Stage 1
@@ -437,52 +447,75 @@ class EnhancedSearchV2:

             Feature list with association info
         """
         logger.info("=" * 60)
-        logger.info("阶段2:查找关联分类")
+        logger.info("阶段2:查找关联分类(为每个base_word)")
         logger.info("=" * 60)

         for idx, feature in enumerate(filtered_features, 1):
             logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {feature['原始特征名称']}")

-            match_info = feature['最高匹配信息']
-            is_classification = match_info['是分类']
-            classification_path = match_info['所属分类路径']
-            source_level = match_info['人设特征层级']
+            # Get the top-3 base_words
+            top3_info = feature.get('top3匹配信息', [])
+            if not top3_info:
+                logger.warning(f"  无top3匹配信息,跳过")
+                feature['找到的关联_按base_word'] = {}
+                continue
-            if is_classification:
-                # Matched a classification: use its path directly
-                search_path = classification_path
-                logger.info(f"  匹配到分类: {search_path}")
-            else:
-                # Matched a feature: use the classification path it belongs to
-                search_path = classification_path
-                logger.info(f"  匹配到特征,使用所属分类: {search_path}")
-
-            # Look up associations
-            associations = self._find_associations(search_path, source_level)
-
-            # Collect association info
-            feature['找到的关联'] = []
-
-            for assoc in associations:
-                target_path = assoc['目标分类']
-                logger.info(f"  处理关联: {target_path}")
-
-                # Collect classification info
-                classification_info = self._collect_classification_info(target_path)
-
-                if classification_info:
-                    feature['找到的关联'].append({
-                        '来源方向': assoc['来源方向'],
-                        '关联类型': assoc['关联类型'],
-                        '目标分类路径': target_path,
-                        '共同帖子数': assoc['共同帖子数'],
-                        'Jaccard相似度': assoc['Jaccard相似度'],
-                        '分类名称': classification_info['classification_name'],
-                        '标签列表': classification_info['tags'],
-                        '子分类列表': classification_info['sub_classifications']
-                    })
+            logger.info(f"  找到 {len(top3_info)} 个base_word")
+
+            # Look up associations for each base_word
+            associations_by_base_word = {}
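+            # Note: keyed by the base_word's 人设特征名称; if two of the top-3
+            # matches share the same name, the later entry overwrites the
+            # earlier one.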

-            logger.info(f"  找到 {len(feature['找到的关联'])} 个关联")
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                is_classification = base_info['是分类']
+                classification_path = base_info['所属分类路径']
+                source_level = base_info['人设特征层级']
+
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
+
+                if is_classification:
+                    search_path = classification_path
+                    logger.info(f"    匹配到分类: {search_path}")
+                else:
+                    search_path = classification_path
+                    logger.info(f"    匹配到特征,使用所属分类: {search_path}")
+
+                # Look up associations
+                associations = self._find_associations(search_path, source_level)
+
+                # Collect association info
+                base_word_associations = []
+
+                for assoc in associations:
+                    target_path = assoc['目标分类']
+
+                    # Collect classification info
+                    classification_info = self._collect_classification_info(target_path)
+
+                    if classification_info:
+                        base_word_associations.append({
+                            '来源方向': assoc['来源方向'],
+                            '关联类型': assoc['关联类型'],
+                            '目标分类路径': target_path,
+                            '共同帖子数': assoc['共同帖子数'],
+                            'Jaccard相似度': assoc['Jaccard相似度'],
+                            '分类名称': classification_info['classification_name'],
+                            '标签列表': classification_info['tags'],
+                            '子分类列表': classification_info['sub_classifications']
+                        })
+
+                associations_by_base_word[base_word] = base_word_associations
+                logger.info(f"    找到 {len(base_word_associations)} 个关联")
+
+            # Save the results
+            feature['找到的关联_按base_word'] = associations_by_base_word
+
+            # Backward compatibility: keep the associations derived from the top match (i.e. the 1st base_word's associations)
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])
+
+            total_associations = sum(len(v) for v in associations_by_base_word.values())
+            logger.info(f"  总共找到 {total_associations} 个关联({len(associations_by_base_word)} 个base_word)")

         # Save the results
         output_path = os.path.join(self.output_dir, "stage2_associations.json")
@@ -589,8 +622,8 @@ class EnhancedSearchV2:

         """
         Stage 3: select high-similarity matches (>0.8)

-        Walks every original feature in the how-deconstruction and picks the high-quality
-        matches whose similarity is >0.8 and whose persona feature name is inside the Stage 2 association scope
+        Improvement: select candidate words independently for each base_word,
+        finding matches with similarity >0.8 in the how-deconstruction based on that base_word's own association scope

         Args:
             associations_data: association data from Stage 2
@@ -599,76 +632,107 @@ class EnhancedSearchV2:

             Data with high-similarity candidates
         """
         logger.info("=" * 60)
-        logger.info("阶段3:筛选高相似度匹配(>0.8)")
+        logger.info("阶段3:筛选高相似度匹配(>0.8,为每个base_word)")
         logger.info("=" * 60)

         for idx, feature_result in enumerate(associations_data, 1):
             original_feature_name = feature_result['原始特征名称']
             logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")

-            # Step 1: collect the Stage 2 association scope (classification names + tags)
-            stage2_scope = self._collect_stage2_scope(feature_result)
-            logger.info(f"  Stage2范围包含 {len(stage2_scope)} 个分类/标签")
+            # Get the top-3 base_words
+            top3_info = feature_result.get('top3匹配信息', [])
+            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
+
+            if not top3_info or not associations_by_base_word:
+                logger.warning(f"  无top3匹配信息或关联数据,跳过")
+                feature_result['高相似度候选_按base_word'] = {}
+                continue
+
+            logger.info(f"  找到 {len(top3_info)} 个base_word")
+
+            # Select candidate words independently for each base_word
+            candidates_by_base_word = {}
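+            # Note: the de-duplication below is per base_word, so the same
+            # candidate word may legitimately appear under several base_words.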

-            # Step 2: walk all original features in the how-deconstruction and find high-similarity matches
-            high_sim_candidates = []
-            total_checked = 0
-            high_sim_found = 0
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")

-            how_result = self.how_data.get('how解构结果', {})
-            for level_name, level_list in how_result.items():
-                if not isinstance(level_list, list):
+                # Step 1: collect this base_word's association scope
+                base_word_associations = associations_by_base_word.get(base_word, [])
+                base_word_scope = self._collect_scope_from_associations(base_word_associations)
+                logger.info(f"    关联范围包含 {len(base_word_scope)} 个分类/标签")
+
+                if not base_word_scope:
+                    logger.warning(f"    无关联范围,跳过")
+                    candidates_by_base_word[base_word] = []
                     continue

-                for item in level_list:
-                    for step in item.get('how步骤列表', []):
-                        for feature in step.get('特征列表', []):
-                            # Get all matches for this feature
-                            matches = feature.get('匹配结果', [])
-                            total_checked += len(matches)
-
-                            # Keep matches with similarity >0.8 inside the Stage 2 scope
-                            for match in matches:
-                                sim = match.get('匹配结果', {}).get('相似度', 0)
-                                persona_feature_name = match.get('人设特征名称', '')
-
-                                if sim > 0.8 and persona_feature_name in stage2_scope:
-                                    high_sim_found += 1
-                                    # Record source info
-                                    high_sim_candidates.append({
-                                        '人设特征名称': persona_feature_name,
-                                        '相似度': sim,
-                                        '特征类型': match.get('特征类型', ''),
-                                        '特征分类': match.get('特征分类', []),
-                                        '人设特征层级': match.get('人设特征层级', ''),
-                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
-                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
-                                        '来源原始特征': feature.get('特征名称', '')  # record which original feature it came from
-                                    })
-
-            logger.info(f"  检查了 {total_checked} 个匹配")
-            logger.info(f"  找到 {high_sim_found} 个相似度>0.8的匹配")
-
-            # Sort by similarity in descending order and de-duplicate (keep only the highest score per persona feature name)
-            seen_names = set()
-            unique_candidates = []
-            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
-
-            for candidate in high_sim_candidates:
-                name = candidate['人设特征名称']
-                if name not in seen_names:
-                    seen_names.add(name)
-                    unique_candidates.append(candidate)
-
-            # Add to the results
-            feature_result['高相似度候选'] = unique_candidates
-            logger.info(f"  去重后筛选出 {len(unique_candidates)} 个高相似度候选")
-
-            # Show the first 5
-            if unique_candidates:
-                logger.info(f"  Top 5:")
-                for c in unique_candidates[:5]:
-                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
+                # Step 2: walk the how-deconstruction and find high-similarity matches
+                high_sim_candidates = []
+                total_checked = 0
+                high_sim_found = 0
+
+                how_result = self.how_data.get('how解构结果', {})
+                for level_name, level_list in how_result.items():
+                    if not isinstance(level_list, list):
+                        continue
+
+                    for item in level_list:
+                        for step in item.get('how步骤列表', []):
+                            for feature in step.get('特征列表', []):
+                                matches = feature.get('匹配结果', [])
+                                total_checked += len(matches)
+
+                                # Keep matches with similarity >0.8 inside this base_word's scope
+                                for match in matches:
+                                    sim = match.get('匹配结果', {}).get('相似度', 0)
+                                    persona_feature_name = match.get('人设特征名称', '')
+
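+                                    # Scope membership is an exact string match
+                                    # against the classification names and tags
+                                    # collected from this base_word's
+                                    # associations.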
+                                    if sim > 0.8 and persona_feature_name in base_word_scope:
+                                        high_sim_found += 1
+                                        high_sim_candidates.append({
+                                            '人设特征名称': persona_feature_name,
+                                            '相似度': sim,
+                                            '特征类型': match.get('特征类型', ''),
+                                            '特征分类': match.get('特征分类', []),
+                                            '人设特征层级': match.get('人设特征层级', ''),
+                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                            '来源原始特征': feature.get('特征名称', '')
+                                        })
+
+                logger.info(f"    检查了 {total_checked} 个匹配")
+                logger.info(f"    找到 {high_sim_found} 个相似度>0.8的匹配")
+
+                # Sort by similarity in descending order and de-duplicate
+                seen_names = set()
+                unique_candidates = []
+                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+                for candidate in high_sim_candidates:
+                    name = candidate['人设特征名称']
+                    if name not in seen_names:
+                        seen_names.add(name)
+                        unique_candidates.append(candidate)
+
+                candidates_by_base_word[base_word] = unique_candidates
+                logger.info(f"    去重后筛选出 {len(unique_candidates)} 个候选")
+
+                # Show the first 5
+                if unique_candidates:
+                    logger.info(f"    Top 5:")
+                    for c in unique_candidates[:5]:
+                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
+
+            # Save the results
+            feature_result['高相似度候选_按base_word'] = candidates_by_base_word
+
+            # Backward compatibility: keep the 1st base_word's candidates
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])
+
+            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
+            logger.info(f"  总共筛选出 {total_candidates} 个候选({len(candidates_by_base_word)} 个base_word)")

         # Save the results
         output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
@@ -681,19 +745,19 @@ class EnhancedSearchV2:

         return associations_data

-    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
         """
-        Collect all classification names and tags found in Stage 2 into a scope set
+        Collect all classification names and tags from an association list into a scope set

         Args:
-            feature_result: feature result data
+            associations: the association list

         Returns:
             A set containing all classification names and tags
         """
         scope = set()

-        for assoc in feature_result.get('找到的关联', []):
+        for assoc in associations:
             # Add the classification name
             scope.add(assoc['分类名称'])
@@ -703,6 +767,19 @@ class EnhancedSearchV2:

         return scope

+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        Collect all classification names and tags found in Stage 2 into a scope set (compatibility wrapper for the old method)
+
+        Args:
+            feature_result: feature result data
+
+        Returns:
+            A set containing all classification names and tags
+        """
+        associations = feature_result.get('找到的关联', [])
+        return self._collect_scope_from_associations(associations)
+
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Find the feature list for a given path
@@ -797,12 +874,16 @@ class EnhancedSearchV2:

         """
         Generate and evaluate combinations for a single original feature

+        Improvement: each base_word uses its own candidate words (instead of a shared pool)
+
         Steps:
-        1. Get base_word from Stage1's 最高匹配信息
-        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
-        3. Generate 2-N word combinations
-        4. LLM batch evaluation
-        5. Select Top 10 and write back
+        1. Get top3 base_words from Stage1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from Stage3's 高相似度候选_按base_word
+           b. Generate combinations
+           c. LLM evaluation
+           d. Select Top 10
+        3. Save grouped results

         Args:
             idx: feature index
@@ -814,62 +895,94 @@ class EnhancedSearchV2:

         original_feature = feature_result['原始特征名称']
         logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")

-        # Step 1: get the base word
-        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
-        if not base_word:
-            logger.info(f"  无基础词,跳过")
-            feature_result['组合评估结果'] = []
+        # Step 1: get the top-3 base words
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info(f"  无top3匹配信息,跳过")
+            feature_result['组合评估结果_分组'] = []
             return

-        logger.info(f"  基础词: {base_word}")
+        logger.info(f"  找到 {len(top3_info)} 个base_word")

-        # Step 2: get candidate words (from the high-similarity candidates)
-        high_sim_candidates = feature_result.get('高相似度候选', [])
+        # Step 2: get the candidate words grouped by base_word
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})

-        # Cap the number of candidate words
-        candidates = high_sim_candidates[:max_candidates]
-        candidate_words = [c['人设特征名称'] for c in candidates]
-
-        if not candidate_words:
-            logger.info(f"  无候选词,跳过")
-            feature_result['组合评估结果'] = []
+        if not candidates_by_base_word:
+            logger.warning(f"  无按base_word分组的候选词,跳过")
+            feature_result['组合评估结果_分组'] = []
             return

-        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+        # Step 3: process each base_word independently
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word} (相似度: {base_similarity:.3f})")

-        # Step 3: generate all combinations
-        all_combinations = []
+            # Get this base_word's candidate words
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['人设特征名称'] for c in candidates]

-        # Generate combinations of 1 to max_combo_length-1 candidate words (base_word is added on top)
-        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
-            for combo in combinations(candidate_words, length):
-                # Build the search phrase: base word + candidate combination
-                search_phrase = base_word + ' ' + ' '.join(combo)
-                all_combinations.append({
-                    'search_word': search_phrase,
+            if not candidate_words:
+                logger.warning(f"    该base_word无候选词,跳过")
+                grouped_results.append({
                     'base_word': base_word,
-                    'candidate_words': list(combo),
-                    'combo_length': length + 1  # +1 because base_word is included
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
                 })
+                continue

-        logger.info(f"  生成 {len(all_combinations)} 个组合")
+            logger.info(f"    候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+            # Generate combinations
+            combinations_for_base = []
+            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+                for combo in combinations(candidate_words, length):
+                    search_phrase = base_word + ' ' + ' '.join(combo)
+                    combinations_for_base.append({
+                        'search_word': search_phrase,
+                        'base_word': base_word,
+                        'candidate_words': list(combo),
+                        'combo_length': length + 1
+                    })
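+            # With n candidate words this yields sum(C(n, k)) phrases for
+            # k = 1 .. min(max_combo_length - 1, n), so max_candidates directly
+            # bounds the number of LLM evaluations below.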

-        # Step 4: LLM batch evaluation
-        logger.info(f"  开始LLM评估...")
-        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-            original_feature=original_feature,
-            search_words=[c['search_word'] for c in all_combinations],
-            batch_size=50
-        )
+            logger.info(f"    生成 {len(combinations_for_base)} 个组合")
+
+            # LLM evaluation
+            logger.info(f"    开始LLM评估...")
+            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+                original_feature=original_feature,
+                search_words=[c['search_word'] for c in combinations_for_base],
+                batch_size=50
+            )
+
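+            # Assumes evaluate_search_words_in_batches returns items sorted by
+            # score in descending order, as the previous single-base_word code
+            # path already relied on.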
+            # Pick the Top 10
+            top_10 = evaluated[:10]
+            max_score = top_10[0]['score'] if top_10 else 0.0
+            logger.info(f"    评估完成,Top 10 最高分: {max_score:.3f}")

-        # Step 5: pick the Top 10
-        top_10 = evaluated[:10]
+            # Save the grouped result - each base_word has its own available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # this base_word's own candidate words
+            })

         # Write back the results
-        feature_result['组合评估结果'] = top_10
+        feature_result['组合评估结果_分组'] = grouped_results

-        max_score = top_10[0]['score'] if top_10 else 0.0
-        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  完成!共 {len(grouped_results)} 个base_word,{total_searches} 个搜索词")

     # ========== Stage 5: execute searches ==========
@@ -954,7 +1067,7 @@ class EnhancedSearchV2:

         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)

-        # Group search words by original feature (read from Stage 4's 组合评估结果)
+        # Group search words by original feature (read from Stage 4's 组合评估结果_分组)
         feature_search_groups = {}

         for feature_result in features_data:
@@ -963,46 +1076,60 @@ class EnhancedSearchV2:

             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []

-            # Read from Stage 4's 组合评估结果
-            for eval_item in feature_result.get('组合评估结果', []):
-                sw = eval_item.get('search_word')
-                if not sw:
-                    continue
+            # Read from Stage 4's 组合评估结果_分组 (new structure)
+            grouped_results = feature_result.get('组合评估结果_分组', [])

-                score = eval_item.get('score', 0.0)
+            if grouped_results:
+                # Use the grouped structure: every base_word's top 10 gets executed
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)

-                feature_search_groups[original_feature].append({
-                    'search_word': sw,
-                    'score': score,
-                    'feature_ref': eval_item  # reference to the evaluation item, used for writing back search results
-                })
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue

-        # Take Top N per group
-        all_searches = []
-        total_before_filter = 0
-        total_filtered = 0
+                        score = eval_item.get('score', 0.0)

-        for original_feature, search_list in feature_search_groups.items():
-            total_before_filter += len(search_list)
+                        feature_search_groups[original_feature].append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # reference to the evaluation item, used for writing back search results
+                        })
+            else:
+                # Backward compatibility with the old structure (组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue

-            # Sort by score in descending order
-            sorted_list = sorted(search_list, key=lambda x: x['score'], reverse=True)
+                    score = eval_item.get('score', 0.0)

-            # Take the first top_n
-            selected = sorted_list[:top_n]
-            all_searches.extend(selected)
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
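+                    # Legacy items omit the base_word fields that the grouped
+                    # branch above adds.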

+        # Collect all search tasks (with the grouped structure every base_word's top 10 runs; no per-group filtering any more)
+        all_searches = []
+        total_count = 0

-            filtered = len(sorted_list) - len(selected)
-            total_filtered += filtered
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)

-            logger.info(f"  {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
+            logger.info(f"  {original_feature}: {len(search_list)} 个搜索词")

         # Apply the global search-count limit
         if self.max_total_searches and len(all_searches) > self.max_total_searches:
             logger.info(f"  应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
             all_searches = all_searches[:self.max_total_searches]
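+            # Note: the global cap truncates in feature iteration order, not
+            # by score; sorting all_searches by score first would keep the
+            # highest-scoring tasks instead.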

-        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
+        logger.info(f"\n共 {len(all_searches)} 个搜索任务")
         logger.info(f"  并发执行搜索(并发数: {self.search_max_workers})")

         # Execute searches concurrently with a ThreadPoolExecutor