刘立冬 3 weeks ago
parent
commit
0fada6c296
2 changed files with 338 additions and 200 deletions
  1. +327 -200
      enhanced_search_v2.py
  2. +11 -0
      visualize_stage5_results_v3.py

+327 -200
enhanced_search_v2.py

@@ -170,45 +170,53 @@ class EnhancedSearchV2:
                             continue
 
                         # 0.5 <= max_similarity < 0.8: keep
-                        best_match = max(
+                        # Sort by similarity in descending order and take the top 3
+                        sorted_matches = sorted(
                             match_results,
-                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0)
+                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
+                            reverse=True
                         )
+                        top3_matches = sorted_matches[:3]  # take the top 3
 
-                        # Determine whether it is a classification or a feature
-                        feature_classification = best_match.get('特征分类', [])
-                        classification_path = self._build_classification_path(feature_classification)
+                        # Build the top-3 match info list
+                        top3_match_info = []
+                        for match in top3_matches:
+                            feature_classification = match.get('特征分类', [])
+                            classification_path = self._build_classification_path(feature_classification)
 
-                        # If the path is empty and the type is classification, search to complete the path
-                        if not classification_path and best_match.get('特征类型') == '分类':
-                            feature_name_to_search = best_match.get('人设特征名称', '')
-                            classification_path = self._search_classification_path(feature_name_to_search)
+                            # If the path is empty and the type is classification, search to complete the path
+                            if not classification_path and match.get('特征类型') == '分类':
+                                feature_name_to_search = match.get('人设特征名称', '')
+                                classification_path = self._search_classification_path(feature_name_to_search)
 
-                        is_classification = self._is_classification(best_match.get('人设特征名称', ''), classification_path)
+                            is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
+
+                            top3_match_info.append({
+                                '人设特征名称': match.get('人设特征名称'),
+                                '人设特征层级': match.get('人设特征层级'),
+                                '特征类型': match.get('特征类型'),
+                                '特征分类': feature_classification,
+                                '相似度': match.get('匹配结果', {}).get('相似度', 0),
+                                '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                '是分类': is_classification,
+                                '所属分类路径': classification_path
+                            })
 
                         result_item = {
                             '原始特征名称': feature_name,
                             '来源层级': level_name,
                             '权重': feature.get('权重', 0),
                             '所属点名称': item_name,
-                            '最高匹配信息': {
-                                '人设特征名称': best_match.get('人设特征名称'),
-                                '人设特征层级': best_match.get('人设特征层级'),
-                                '特征类型': best_match.get('特征类型'),
-                                '特征分类': feature_classification,
-                                '相似度': best_match.get('匹配结果', {}).get('相似度', 0),
-                                '匹配说明': best_match.get('匹配结果', {}).get('说明', ''),
-                                '是分类': is_classification,
-                                '所属分类路径': classification_path
-                            }
+                            '最高匹配信息': top3_match_info[0],  # keep the first one for Stage 2
+                            'top3匹配信息': top3_match_info  # new field
                         }
 
                         results.append(result_item)
                         selected_count += 1
 
-                        logger.info(f"  ✓ {feature_name} → {best_match.get('人设特征名称')} "
-                                   f"(similarity: {max_similarity:.3f}, "
-                                   f"{'classification' if is_classification else 'feature'})")
+                        # Log the top-3 match info
+                        top3_names = [m['人设特征名称'] for m in top3_match_info]
+                        logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")
 
         # Summary statistics
         logger.info(f"\n" + "=" * 60)
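
Note on the Stage 1 change above: the selection boils down to a sort-then-slice over the nested '匹配结果' → '相似度' score. A minimal standalone sketch (the input shape is taken from the diff; the names and scores are hypothetical):

    # Hypothetical match_results; shape mirrors the diff above.
    match_results = [
        {'人设特征名称': 'A', '匹配结果': {'相似度': 0.62}},
        {'人设特征名称': 'B', '匹配结果': {'相似度': 0.71}},
        {'人设特征名称': 'C', '匹配结果': {'相似度': 0.55}},
        {'人设特征名称': 'D', '匹配结果': {'相似度': 0.68}},
    ]

    # Sort descending by similarity, keep the top 3.
    top3 = sorted(
        match_results,
        key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
        reverse=True,
    )[:3]

    print([m['人设特征名称'] for m in top3])  # ['B', 'D', 'A']

For larger candidate lists, heapq.nlargest(3, match_results, key=...) would avoid the full sort, though at top-3 over a handful of matches the difference is negligible.
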
@@ -430,6 +438,8 @@ class EnhancedSearchV2:
         """
         阶段2:查找关联分类,收集分类名称、标签、子分类
 
+        改进: 为top3的每个base_word都查找关联
+
         Args:
             filtered_features: 阶段1筛选的特征
 
@@ -437,52 +447,75 @@ class EnhancedSearchV2:
             带关联信息的特征列表
         """
         logger.info("=" * 60)
-        logger.info("阶段2:查找关联分类")
+        logger.info("阶段2:查找关联分类(为每个base_word)")
         logger.info("=" * 60)
 
         for idx, feature in enumerate(filtered_features, 1):
             logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {feature['原始特征名称']}")
 
-            match_info = feature['最高匹配信息']
-            is_classification = match_info['是分类']
-            classification_path = match_info['所属分类路径']
-            source_level = match_info['人设特征层级']
+            # Get the top-3 base_words
+            top3_info = feature.get('top3匹配信息', [])
+            if not top3_info:
+                logger.warning(f"  No top-3 match info, skipping")
+                feature['找到的关联_按base_word'] = {}
+                continue
 
-            if is_classification:
-                # Matched a classification; use its path directly
-                search_path = classification_path
-                logger.info(f"  Matched classification: {search_path}")
-            else:
-                # Matched a feature; use the path of its parent classification
-                search_path = classification_path
-                logger.info(f"  Matched a feature; using its classification: {search_path}")
-
-            # Find associations
-            associations = self._find_associations(search_path, source_level)
-
-            # Collect association info
-            feature['找到的关联'] = []
-
-            for assoc in associations:
-                target_path = assoc['目标分类']
-                logger.info(f"  Processing association: {target_path}")
-
-                # Collect classification info
-                classification_info = self._collect_classification_info(target_path)
-
-                if classification_info:
-                    feature['找到的关联'].append({
-                        '来源方向': assoc['来源方向'],
-                        '关联类型': assoc['关联类型'],
-                        '目标分类路径': target_path,
-                        '共同帖子数': assoc['共同帖子数'],
-                        'Jaccard相似度': assoc['Jaccard相似度'],
-                        '分类名称': classification_info['classification_name'],
-                        '标签列表': classification_info['tags'],
-                        '子分类列表': classification_info['sub_classifications']
-                    })
+            logger.info(f"  Found {len(top3_info)} base_words")
+
+            # Find associations for each base_word
+            associations_by_base_word = {}
 
-            logger.info(f"  Found {len(feature['找到的关联'])} associations")
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                is_classification = base_info['是分类']
+                classification_path = base_info['所属分类路径']
+                source_level = base_info['人设特征层级']
+
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
+
+                if is_classification:
+                    search_path = classification_path
+                    logger.info(f"    Matched classification: {search_path}")
+                else:
+                    search_path = classification_path
+                    logger.info(f"    Matched a feature; using its classification: {search_path}")
+
+                # Find associations
+                associations = self._find_associations(search_path, source_level)
+
+                # Collect association info
+                base_word_associations = []
+
+                for assoc in associations:
+                    target_path = assoc['目标分类']
+
+                    # Collect classification info
+                    classification_info = self._collect_classification_info(target_path)
+
+                    if classification_info:
+                        base_word_associations.append({
+                            '来源方向': assoc['来源方向'],
+                            '关联类型': assoc['关联类型'],
+                            '目标分类路径': target_path,
+                            '共同帖子数': assoc['共同帖子数'],
+                            'Jaccard相似度': assoc['Jaccard相似度'],
+                            '分类名称': classification_info['classification_name'],
+                            '标签列表': classification_info['tags'],
+                            '子分类列表': classification_info['sub_classifications']
+                        })
+
+                associations_by_base_word[base_word] = base_word_associations
+                logger.info(f"    Found {len(base_word_associations)} associations")
+
+            # Save results
+            feature['找到的关联_按base_word'] = associations_by_base_word
+
+            # Backward compatibility: keep the associations based on 最高匹配信息 (i.e., the first base_word's associations)
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])
+
+            total_associations = sum(len(v) for v in associations_by_base_word.values())
+            logger.info(f"  Found {total_associations} associations in total ({len(associations_by_base_word)} base_words)")
 
         # Save results
         output_path = os.path.join(self.output_dir, "stage2_associations.json")
@@ -589,8 +622,8 @@ class EnhancedSearchV2:
         """
         Stage 3: Filter high-similarity matches (>0.8)
 
-        Iterate over all original features in the how-deconstruction data and find
-        high-quality matches with similarity >0.8 whose 人设特征名称 falls within the Stage 2 association scope
+        Improvement: filter candidate words independently for each base_word.
+        Within that base_word's association scope, find matches with similarity >0.8 in the how-deconstruction data.
 
         Args:
             associations_data: association data from Stage 2
@@ -599,76 +632,107 @@
             Data with high-similarity candidates
         """
         logger.info("=" * 60)
-        logger.info("Stage 3: Filter high-similarity matches (>0.8)")
+        logger.info("Stage 3: Filter high-similarity matches (>0.8, per base_word)")
         logger.info("=" * 60)
 
         for idx, feature_result in enumerate(associations_data, 1):
             original_feature_name = feature_result['原始特征名称']
             logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")
 
-            # Step 1: Collect the Stage 2 association scope (classification names + tags)
-            stage2_scope = self._collect_stage2_scope(feature_result)
-            logger.info(f"  Stage 2 scope contains {len(stage2_scope)} classifications/tags")
+            # Get the top-3 base_words
+            top3_info = feature_result.get('top3匹配信息', [])
+            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
+
+            if not top3_info or not associations_by_base_word:
+                logger.warning(f"  No top-3 match info or association data, skipping")
+                feature_result['高相似度候选_按base_word'] = {}
+                continue
+
+            logger.info(f"  Found {len(top3_info)} base_words")
+
+            # Filter candidate words independently for each base_word
+            candidates_by_base_word = {}
 
-            # Step 2: Iterate over all original features in the how-deconstruction and find high-similarity matches
-            high_sim_candidates = []
-            total_checked = 0
-            high_sim_found = 0
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
 
-            how_result = self.how_data.get('how解构结果', {})
-            for level_name, level_list in how_result.items():
-                if not isinstance(level_list, list):
+                # Step 1: Collect this base_word's association scope
+                base_word_associations = associations_by_base_word.get(base_word, [])
+                base_word_scope = self._collect_scope_from_associations(base_word_associations)
+                logger.info(f"    Association scope contains {len(base_word_scope)} classifications/tags")
+
+                if not base_word_scope:
+                    logger.warning(f"    Empty association scope, skipping")
+                    candidates_by_base_word[base_word] = []
                     continue
 
-                for item in level_list:
-                    for step in item.get('how步骤列表', []):
-                        for feature in step.get('特征列表', []):
-                            # Get all matches for this feature
-                            matches = feature.get('匹配结果', [])
-                            total_checked += len(matches)
-
-                            # Filter matches with similarity >0.8 that fall within the Stage 2 scope
-                            for match in matches:
-                                sim = match.get('匹配结果', {}).get('相似度', 0)
-                                persona_feature_name = match.get('人设特征名称', '')
-
-                                if sim > 0.8 and persona_feature_name in stage2_scope:
-                                    high_sim_found += 1
-                                    # Record provenance info
-                                    high_sim_candidates.append({
-                                        '人设特征名称': persona_feature_name,
-                                        '相似度': sim,
-                                        '特征类型': match.get('特征类型', ''),
-                                        '特征分类': match.get('特征分类', []),
-                                        '人设特征层级': match.get('人设特征层级', ''),
-                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
-                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
-                                        '来源原始特征': feature.get('特征名称', '')  # record which original feature it came from
-                                    })
-
-            logger.info(f"  Checked {total_checked} matches")
-            logger.info(f"  Found {high_sim_found} matches with similarity >0.8")
-
-            # Sort by similarity descending and deduplicate (keep only the top score per 人设特征名称)
-            seen_names = set()
-            unique_candidates = []
-            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
-
-            for candidate in high_sim_candidates:
-                name = candidate['人设特征名称']
-                if name not in seen_names:
-                    seen_names.add(name)
-                    unique_candidates.append(candidate)
-
-            # Add to the results
-            feature_result['高相似度候选'] = unique_candidates
-            logger.info(f"  {len(unique_candidates)} high-similarity candidates after deduplication")
-
-            # Show the top 5
-            if unique_candidates:
-                logger.info(f"  Top 5:")
-                for c in unique_candidates[:5]:
-                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")
+                # Step 2: Iterate over the how-deconstruction and find high-similarity matches
+                high_sim_candidates = []
+                total_checked = 0
+                high_sim_found = 0
+
+                how_result = self.how_data.get('how解构结果', {})
+                for level_name, level_list in how_result.items():
+                    if not isinstance(level_list, list):
+                        continue
+
+                    for item in level_list:
+                        for step in item.get('how步骤列表', []):
+                            for feature in step.get('特征列表', []):
+                                matches = feature.get('匹配结果', [])
+                                total_checked += len(matches)
+
+                                # Filter matches with similarity >0.8 that fall within this base_word's scope
+                                for match in matches:
+                                    sim = match.get('匹配结果', {}).get('相似度', 0)
+                                    persona_feature_name = match.get('人设特征名称', '')
+
+                                    if sim > 0.8 and persona_feature_name in base_word_scope:
+                                        high_sim_found += 1
+                                        high_sim_candidates.append({
+                                            '人设特征名称': persona_feature_name,
+                                            '相似度': sim,
+                                            '特征类型': match.get('特征类型', ''),
+                                            '特征分类': match.get('特征分类', []),
+                                            '人设特征层级': match.get('人设特征层级', ''),
+                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                            '来源原始特征': feature.get('特征名称', '')
+                                        })
+
+                logger.info(f"    Checked {total_checked} matches")
+                logger.info(f"    Found {high_sim_found} matches with similarity >0.8")
+
+                # Sort by similarity descending and deduplicate
+                seen_names = set()
+                unique_candidates = []
+                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+                for candidate in high_sim_candidates:
+                    name = candidate['人设特征名称']
+                    if name not in seen_names:
+                        seen_names.add(name)
+                        unique_candidates.append(candidate)
+
+                candidates_by_base_word[base_word] = unique_candidates
+                logger.info(f"    {len(unique_candidates)} candidates after deduplication")
+
+                # Show the top 5
+                if unique_candidates:
+                    logger.info(f"    Top 5:")
+                    for c in unique_candidates[:5]:
+                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")
+
+            # Save results
+            feature_result['高相似度候选_按base_word'] = candidates_by_base_word
+
+            # Backward compatibility: keep the first base_word's candidates
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])
+
+            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
+            logger.info(f"  Selected {total_candidates} candidates in total ({len(candidates_by_base_word)} base_words)")
 
         # Save results
         output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
@@ -681,19 +745,19 @@ class EnhancedSearchV2:
         return associations_data
 
 
-    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
         """
-        Collect all classification names and tags found in Stage 2 into a scope set
+        Collect all classification names and tags from an association list into a scope set
 
         Args:
-            feature_result: feature result data
+            associations: list of associations
 
         Returns:
             A set containing all classification names and tags
         """
         scope = set()
 
-        for assoc in feature_result.get('找到的关联', []):
+        for assoc in associations:
             # Add the classification name
             scope.add(assoc['分类名称'])
 
@@ -703,6 +767,19 @@ class EnhancedSearchV2:
 
         return scope
 
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        Collect all classification names and tags found in Stage 2 into a scope set (legacy wrapper)
+
+        Args:
+            feature_result: feature result data
+
+        Returns:
+            A set containing all classification names and tags
+        """
+        associations = feature_result.get('找到的关联', [])
+        return self._collect_scope_from_associations(associations)
+
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Look up the feature list for a given classification path
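
For reference, the scope set built by _collect_scope_from_associations is a flat set of classification names and tags, so the Stage 3 membership checks are O(1). A minimal sketch (hypothetical data; this assumes the elided part of the method also adds the 标签列表 entries, as the Stage 2 output fields suggest):

    # Hypothetical associations in the Stage 2 output shape.
    associations = [
        {'分类名称': '饮品', '标签列表': ['拿铁', '美式']},
        {'分类名称': '烘焙', '标签列表': ['可颂']},
    ]

    scope = set()
    for assoc in associations:
        scope.add(assoc['分类名称'])            # classification name
        scope.update(assoc.get('标签列表', []))  # its tags

    print('拿铁' in scope, '蛋糕' in scope)  # True False
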
@@ -797,12 +874,16 @@ class EnhancedSearchV2:
         """
         Generate and evaluate search-word combinations for a single original feature
 
+        Improvement: each base_word uses its own candidate words (instead of a shared pool)
+
         Steps:
-        1. Get base_word from Stage1's 最高匹配信息
-        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
-        3. Generate 2-N word combinations
-        4. LLM batch evaluation
-        5. Select Top 10 and write back
+        1. Get top3 base_words from Stage1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from Stage3's 高相似度候选_按base_word
+           b. Generate combinations
+           c. LLM evaluation
+           d. Select Top 10
+        3. Save grouped results
 
         Args:
             idx: 特征索引
@@ -814,62 +895,94 @@ class EnhancedSearchV2:
         original_feature = feature_result['原始特征名称']
         logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
-        # Step 1: Get the base word
-        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
-        if not base_word:
-            logger.info(f"  No base word, skipping")
-            feature_result['组合评估结果'] = []
+        # Step 1: Get the top-3 base words
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info(f"  No top-3 match info, skipping")
+            feature_result['组合评估结果_分组'] = []
             return
 
-        logger.info(f"  Base word: {base_word}")
+        logger.info(f"  Found {len(top3_info)} base_words")
 
-        # Step 2: Get candidate words (from the high-similarity candidates)
-        high_sim_candidates = feature_result.get('高相似度候选', [])
+        # Step 2: Get the candidate words grouped by base_word
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
 
-        # Cap the number of candidate words
-        candidates = high_sim_candidates[:max_candidates]
-        candidate_words = [c['人设特征名称'] for c in candidates]
-
-        if not candidate_words:
-            logger.info(f"  No candidate words, skipping")
-            feature_result['组合评估结果'] = []
+        if not candidates_by_base_word:
+            logger.warning(f"  No per-base_word candidate words, skipping")
+            feature_result['组合评估结果_分组'] = []
             return
 
-        logger.info(f"  Candidate word count: {len(candidate_words)} (limit: {max_candidates})")
+        # Step 3: Process each base_word independently
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word} (similarity: {base_similarity:.3f})")
 
-        # Step 3: Generate all combinations
-        all_combinations = []
+            # Get this base_word's candidate words
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['人设特征名称'] for c in candidates]
 
-        # Generate candidate-word combos of 1 to max_combo_length-1 words (the base_word is added on top)
-        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
-            for combo in combinations(candidate_words, length):
-                # Assemble the search phrase: base word + candidate combo
-                search_phrase = base_word + ' ' + ' '.join(combo)
-                all_combinations.append({
-                    'search_word': search_phrase,
+            if not candidate_words:
+                logger.warning(f"    No candidate words for this base_word, skipping")
+                grouped_results.append({
                     'base_word': base_word,
-                    'candidate_words': list(combo),
-                    'combo_length': length + 1  # +1 because the base_word is included
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
                 })
+                continue
 
-        logger.info(f"  Generated {len(all_combinations)} combinations")
+            logger.info(f"    Candidate word count: {len(candidate_words)} (limit: {max_candidates})")
+
+            # Generate combinations
+            combinations_for_base = []
+            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+                for combo in combinations(candidate_words, length):
+                    search_phrase = base_word + ' ' + ' '.join(combo)
+                    combinations_for_base.append({
+                        'search_word': search_phrase,
+                        'base_word': base_word,
+                        'candidate_words': list(combo),
+                        'combo_length': length + 1
+                    })
 
-        # Step 4: Batch LLM evaluation
-        logger.info(f"  Starting LLM evaluation...")
-        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-            original_feature=original_feature,
-            search_words=[c['search_word'] for c in all_combinations],
-            batch_size=50
-        )
+            logger.info(f"    Generated {len(combinations_for_base)} combinations")
+
+            # LLM evaluation
+            logger.info(f"    Starting LLM evaluation...")
+            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+                original_feature=original_feature,
+                search_words=[c['search_word'] for c in combinations_for_base],
+                batch_size=50
+            )
+
+            # Pick the Top 10
+            top_10 = evaluated[:10]
+            max_score = top_10[0]['score'] if top_10 else 0.0
+            logger.info(f"    Evaluation done; best Top 10 score: {max_score:.3f}")
 
-        # Step 5: Pick the Top 10
-        top_10 = evaluated[:10]
+            # Save the grouped result: each base_word gets its own available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # this base_word's own candidate words
+            })
 
         # Write results back
-        feature_result['组合评估结果'] = top_10
+        feature_result['组合评估结果_分组'] = grouped_results
 
-        max_score = top_10[0]['score'] if top_10 else 0.0
-        logger.info(f"  Evaluation done; best Top 10 score: {max_score:.3f}")
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  Done: {len(grouped_results)} base_words, {total_searches} search phrases in total")
 
     # ========== Stage 5: Execute searches ==========
 
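
The combination generator in Step 3 enumerates every 1- to (max_combo_length-1)-word subset of the candidates and prefixes each with the base_word, so a phrase never exceeds max_combo_length words in total. A standalone sketch with hypothetical words:

    from itertools import combinations

    base_word = '手冲咖啡'                     # hypothetical
    candidate_words = ['拿铁', '豆子', '滤杯']  # hypothetical
    max_combo_length = 3                       # total words incl. base_word

    phrases = []
    for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
        for combo in combinations(candidate_words, length):
            phrases.append(base_word + ' ' + ' '.join(combo))

    print(len(phrases))  # C(3,1) + C(3,2) = 6
    print(phrases[0])    # 手冲咖啡 拿铁

Note that the count grows combinatorially with max_candidates, which is why the cap and the batched LLM evaluation (batch_size=50) matter.
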
@@ -954,7 +1067,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
 
-        # Collect search words grouped by original feature (read from Stage 4's 组合评估结果)
+        # Collect search words grouped by original feature (read from Stage 4's 组合评估结果_分组)
         feature_search_groups = {}
 
         for feature_result in features_data:
@@ -963,46 +1076,60 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []
 
-            # Read from Stage 4's 组合评估结果
-            for eval_item in feature_result.get('组合评估结果', []):
-                sw = eval_item.get('search_word')
-                if not sw:
-                    continue
+            # Read from Stage 4's 组合评估结果_分组 (new structure)
+            grouped_results = feature_result.get('组合评估结果_分组', [])
 
-                score = eval_item.get('score', 0.0)
+            if grouped_results:
+                # Grouped structure: run the top-10 of every base_word
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)
 
-                feature_search_groups[original_feature].append({
-                    'search_word': sw,
-                    'score': score,
-                    'feature_ref': eval_item  # reference to the eval item, used to write back search results
-                })
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue
 
-        # Take the Top N per group
-        all_searches = []
-        total_before_filter = 0
-        total_filtered = 0
+                        score = eval_item.get('score', 0.0)
 
-        for original_feature, search_list in feature_search_groups.items():
-            total_before_filter += len(search_list)
+                        feature_search_groups[original_feature].append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # reference to the eval item, used to write back search results
+                        })
+            else:
+                # Backward compatibility with the old structure (组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue
 
-            # Sort by score descending
-            sorted_list = sorted(search_list, key=lambda x: x['score'], reverse=True)
+                    score = eval_item.get('score', 0.0)
 
-            # Take the first top_n
-            selected = sorted_list[:top_n]
-            all_searches.extend(selected)
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
+
+        # Collect all search tasks (under the grouped structure every base_word's top-10 runs; no further filtering)
+        all_searches = []
+        total_count = 0
 
-            filtered = len(sorted_list) - len(selected)
-            total_filtered += filtered
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)
 
-            logger.info(f"  {original_feature}: selected Top {len(selected)} of {len(sorted_list)} search words ({filtered} filtered out)")
+            logger.info(f"  {original_feature}: {len(search_list)} search words")
 
         # Apply the global search-count limit
         if self.max_total_searches and len(all_searches) > self.max_total_searches:
             logger.info(f"  Applying global limit: reducing from {len(all_searches)} to {self.max_total_searches}")
             all_searches = all_searches[:self.max_total_searches]
 
-        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
+        logger.info(f"\n{len(all_searches)} search tasks in total")
         logger.info(f"  Executing searches concurrently (workers: {self.search_max_workers})")
 
         # Execute searches concurrently with ThreadPoolExecutor

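
Taken together, Stage 5 now prefers the grouped Stage 4 output and falls back to the flat legacy field, so older result files still run. A condensed sketch of that read path (hypothetical data; field names from the diff):

    def collect_search_words(feature_result):
        grouped = feature_result.get('组合评估结果_分组', [])
        if grouped:
            # New structure: every base_word's top-10 is executed.
            return [item['search_word']
                    for group in grouped
                    for item in group.get('top10_searches', [])]
        # Legacy flat structure.
        return [item['search_word']
                for item in feature_result.get('组合评估结果', [])]

    legacy = {'组合评估结果': [{'search_word': '手冲咖啡 拿铁', 'score': 0.9}]}
    print(collect_search_words(legacy))  # ['手冲咖啡 拿铁']
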
+11 -0
visualize_stage5_results_v3.py

@@ -281,6 +281,14 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
             margin-left: 8px;
         }}
 
+        .base-word-available-words {{
+            font-size: 10px;
+            color: #9ca3af;
+            margin-top: 4px;
+            line-height: 1.5;
+            font-weight: normal;
+        }}
+
         /* Level 3: Search Terms */
         .search-terms-list {{
             display: none;
@@ -702,12 +710,15 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                     const baseWord = group.base_word || '';
                     const baseSimilarity = group.base_word_similarity || 0;
                     const searchTerms = group.top10_searches || [];
+                    const availableWords = group.available_words || [];
+                    const wordsStr = availableWords.join('、');
 
                     html += `
                         <div class="base-word-item" onclick="event.stopPropagation(); toggleBaseWord(${{featureIdx}}, ${{groupIdx}})" id="base-word-${{featureIdx}}-${{groupIdx}}">
                             <span class="base-expand-icon">▶</span>
                             <div class="base-word-text">
                                 🎯 ${{baseWord}}
+                                ${{wordsStr ? `<div class="base-word-available-words">${{wordsStr}}</div>` : ''}}
                             </div>
                         </div>
                         <div class="search-terms-list" id="search-terms-${{featureIdx}}-${{groupIdx}}">