3 주 전 · 2d0863a6d2
--- a/enhanced_search_v2.py
+++ b/enhanced_search_v2.py
@@ -51,6 +51,7 @@ class EnhancedSearchV2:
 
				         search_max_workers: int = 3,
			
 
				         max_searches_per_feature: Optional[int] = None,
			
 
				         max_searches_per_base_word: Optional[int] = None,
			
 
				+        combination_source: str = "how_based",
			
 
				         enable_stage6: bool = False,
			
 
				         stage6_max_workers: int = 10,
			
 
				         stage6_max_notes: int = 20,
			
@@ -78,6 +79,9 @@ class EnhancedSearchV2:
 
				             search_max_workers: 搜索并发数（默认3）
			
 
				             max_searches_per_feature: 每个原始特征的最大搜索次数（默认None不限制）
			
 
				             max_searches_per_base_word: 每个base_word的最大搜索次数（默认None不限制）
			
 
				+            combination_source: 组合词来源方式（默认how_based）
			
 
				+                - "how_based": 从how文件提取相似度>=0.8的候选词（新方式，默认）
			
 
				+                - "association": 基于关联分析提取候选词（旧方式）
			
 
				             enable_stage6: 是否启用Stage 6评估（默认False）
			
 
				             stage6_max_workers: Stage 6并发评估数（默认10）
			
 
				             stage6_max_notes: 每个搜索结果评估的最大帖子数（默认20）
			
@@ -100,6 +104,7 @@ class EnhancedSearchV2:
 
				         self.search_max_workers = search_max_workers
			
 
				         self.max_searches_per_feature = max_searches_per_feature
			
 
				         self.max_searches_per_base_word = max_searches_per_base_word
			
 
				+        self.combination_source = combination_source
			
 
				         self.enable_stage6 = enable_stage6
			
 
				         self.stage6_max_workers = stage6_max_workers
			
 
				         self.stage6_max_notes = stage6_max_notes
			
@@ -913,6 +918,125 @@ class EnhancedSearchV2:
 
				 
			
 
				         return scope
			
 
				 
			
 
    def stage23_extract_candidates_from_how(
        self,
        filtered_features: List[Dict[str, Any]],
        similarity_threshold: float = 0.8,
    ) -> List[Dict[str, Any]]:
        """
        Build Stage 3 style results directly from the how file.

        This is the "how_based" replacement for Stage 2 and Stage 3: instead
        of running the association analysis, it collects every persona
        feature in ``self.how_data`` whose match similarity reaches the
        threshold and hands the same candidate list to every base word.

        Steps:
        1. Collect the unique persona features whose similarity is
           >= ``similarity_threshold`` (highest similarity wins on duplicates).
        2. Sort the candidates by similarity, descending.
        3. For every Stage 1 feature, take the persona feature names of its
           first three ``top3匹配信息`` entries as base words and give each
           non-empty base word its own copy of the candidate list.
        4. Save the results to ``stage3_high_similarity_how_based.json`` in
           ``self.output_dir``.

        Args:
            filtered_features: Feature dicts produced by Stage 1.
            similarity_threshold: Minimum similarity (inclusive) a match needs
                to become a candidate. Defaults to 0.8, the value that was
                previously hard-coded.

        Returns:
            One dict per input feature with the keys ``原始特征名称``,
            ``来源层级``, ``权重``, ``top3匹配信息``, ``找到的关联_按base_word``
            (always empty here) and ``高相似度候选_按base_word``. The layout is
            intended to match what the association-based Stage 3 produces.
        """
        logger.info("=" * 60)
        logger.info("Stage 2-3 (新方式): 从how文件提取高相似度候选词")
        logger.info("=" * 60)

        # Step 1: gather the de-duplicated candidates from the whole how file.
        candidates_dict = self._collect_how_candidates(similarity_threshold)

        # Step 2: order the candidates by similarity, best first.
        global_candidates = sorted(
            candidates_dict.values(),
            key=lambda x: x['相似度'],
            reverse=True
        )

        logger.info(f"从how文件提取到 {len(global_candidates)} 个唯一的高相似度候选词")

        # Log the ten best candidates to make the run easy to eyeball.
        if global_candidates:
            logger.info("Top 10 候选词:")
            for i, candidate in enumerate(global_candidates[:10], 1):
                logger.info(f"  {i}. {candidate['人设特征名称']} (相似度: {candidate['相似度']:.3f})")

        # Step 3: build one output record per Stage 1 feature.
        results = []
        total = len(filtered_features)
        for idx, feature_data in enumerate(filtered_features, 1):
            original_feature_name = feature_data.get('原始特征名称', '')
            logger.info(f"\n[{idx}/{total}] 处理: {original_feature_name}")

            top3_matches = feature_data.get('top3匹配信息', [])

            # The base words are the persona feature names of the top matches.
            base_words = [match.get('人设特征名称', '') for match in top3_matches[:3]]
            logger.info(f"  中心词: {', '.join(base_words)}")

            # Every non-empty base word gets its own (shallow) copy of the
            # shared candidate list, so list-level edits downstream stay local.
            # NOTE(review): the list is not filtered per base word, so a base
            # word may also appear among its own candidates - confirm that the
            # consumer of this structure tolerates that.
            high_similarity_by_base = {
                base_word: list(global_candidates)
                for base_word in base_words
                if base_word
            }

            logger.info(f"  每个中心词分配 {len(global_candidates)} 个候选词")

            results.append({
                '原始特征名称': original_feature_name,
                '来源层级': feature_data.get('来源层级', ''),  # metadata carried over
                '权重': feature_data.get('权重', 0),  # metadata carried over
                'top3匹配信息': top3_matches,
                '找到的关联_按base_word': {},  # no association analysis in this mode
                '高相似度候选_按base_word': high_similarity_by_base
            })

        # Persist the results next to the other stage outputs.
        output_path = os.path.join(self.output_dir, 'stage3_high_similarity_how_based.json')
        self._save_json(results, output_path)

        logger.info("\n" + "=" * 60)
        logger.info("Stage 2-3 (新方式) 完成")
        logger.info(f"  提取候选词: {len(global_candidates)} 个")
        logger.info(f"  处理特征: {len(results)} 个")
        logger.info("=" * 60)

        return results

    def _collect_how_candidates(self, similarity_threshold: float) -> Dict[str, Dict[str, Any]]:
        """
        Collect the unique high-similarity persona features from the how file.

        Walks the three dimension lists under ``self.how_data['how解构结果']``
        and, for every match whose similarity is >= ``similarity_threshold``,
        keeps one entry per persona feature name (the one with the highest
        similarity; the first one seen wins on ties).

        Missing or ``None`` containers are treated as empty, and matches whose
        similarity is not a number are skipped instead of raising.

        Args:
            similarity_threshold: Minimum similarity (inclusive) to keep.

        Returns:
            Mapping from persona feature name to its candidate info dict.
        """
        candidates: Dict[str, Dict[str, Any]] = {}
        how_result = self.how_data.get('how解构结果') or {}

        for dimension in ('灵感点列表', '关键点列表', '目的点列表'):
            for item in how_result.get(dimension) or []:
                for step in item.get('how步骤列表') or []:
                    for feature in step.get('特征列表') or []:
                        feature_name = feature.get('特征名称', '')

                        for match in feature.get('匹配结果') or []:
                            # The similarity lives in the nested match result;
                            # `or {}` guards against an explicit null.
                            match_detail = match.get('匹配结果') or {}
                            similarity = match_detail.get('相似度', 0)
                            persona_feature_name = match.get('人设特征名称', '')

                            # Skip unnamed matches and non-numeric similarities.
                            if not persona_feature_name:
                                continue
                            if not isinstance(similarity, (int, float)):
                                continue
                            if similarity < similarity_threshold:
                                continue

                            # De-duplicate: only a strictly higher similarity
                            # replaces an existing entry.
                            existing = candidates.get(persona_feature_name)
                            if existing is not None and similarity <= existing['相似度']:
                                continue

                            classification = match.get('特征分类', [])
                            candidates[persona_feature_name] = {
                                '人设特征名称': persona_feature_name,
                                '相似度': similarity,
                                '特征类型': match.get('特征类型', ''),
                                '特征分类': classification,
                                '人设特征层级': match.get('人设特征层级', ''),
                                '来源路径': self._build_classification_path(classification),
                                '匹配说明': match_detail.get('说明', ''),
                                '来源原始特征': feature_name
                            }

        return candidates
			
 
				+
			
 
				     def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
			
 
				         """
			
 
				         收集Stage2找到的所有分类名和标签，形成范围集合（兼容旧方法）
			
@@ -1088,16 +1212,18 @@ class EnhancedSearchV2:
 
				 
			
 
				             logger.info(f"    候选词数量: {len(candidate_words)} (限制: {max_candidates})")
			
 
				 
			
 
				-            # 生成组合
			
 
				+            # 生成组合（简化策略：只生成 base_word + 1词 和 base_word + 2词）
			
 
				             combinations_for_base = []
			
 
				-            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
			
 
				+            max_additional_words = 2  # 最多额外添加2个词（生成 base_word + 1词 和 base_word + 2词）
			
 
				+
			
 
				+            for length in range(1, min(max_additional_words + 1, len(candidate_words) + 1)):
			
 
				                 for combo in combinations(candidate_words, length):
			
 
				                     search_phrase = base_word + ' ' + ' '.join(combo)
			
 
				                     combinations_for_base.append({
			
 
				                         'search_word': search_phrase,
			
 
				                         'base_word': base_word,
			
 
				                         'candidate_words': list(combo),
			
 
				-                        'combo_length': length + 1
			
 
				+                        'combo_length': length + 1  # +1 因为包含 base_word
			
 
				                     })
			
 
				 
			
 
				             logger.info(f"    生成 {len(combinations_for_base)} 个组合")
			
@@ -1107,7 +1233,8 @@ class EnhancedSearchV2:
 
				             evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
			
 
				                 original_feature=original_feature,
			
 
				                 search_words=[c['search_word'] for c in combinations_for_base],
			
 
				-                batch_size=50
			
 
				+                batch_size=50,
			
 
				+                base_word=base_word  # 传递中心词，确保生成的 source_word 包含 base_word
			
 
				             )
			
 
				 
			
 
				             # 选出Top 10
			
@@ -1696,11 +1823,18 @@ class EnhancedSearchV2:
 
				             # 阶段1
			
 
				             stage1_results = self.stage1_filter_features()
			
 
				 
			
 
				-            # 阶段2
			
 
				-            stage2_results = self.stage2_find_associations(stage1_results)
			
 
				-
			
 
				-            # 阶段3 - 使用新方法：筛选高相似度匹配
			
 
				-            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
			
 
				+            # 阶段2-3：根据 combination_source 选择方式
			
 
				+            if self.combination_source == "how_based":
			
 
				+                # 新方式：直接从how文件提取候选词（跳过Stage 2，直接生成Stage 3格式）
			
 
				+                logger.info(f"\n使用组合词来源方式: {self.combination_source} (新方式)")
			
 
				+                stage3_results = self.stage23_extract_candidates_from_how(stage1_results)
			
 
				+            else:
			
 
				+                # 旧方式：基于关联分析（association）
			
 
				+                logger.info(f"\n使用组合词来源方式: {self.combination_source} (旧方式)")
			
 
				+                # 阶段2
			
 
				+                stage2_results = self.stage2_find_associations(stage1_results)
			
 
				+                # 阶段3
			
 
				+                stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
			
 
				 
			
 
				             # 阶段4
			
 
				             stage4_results = self.stage4_generate_and_evaluate_search_words(
			
@@ -1834,6 +1968,13 @@ def main():
 
				         default=None,
			
 
				         help='每个base_word的最大搜索次数（默认None不限制）'
			
 
				     )
			
 
				+    parser.add_argument(
			
 
				+        '--combination-source',
			
 
				+        type=str,
			
 
				+        choices=['how_based', 'association'],
			
 
				+        default='how_based',
			
 
				+        help='组合词来源方式（默认how_based）：how_based=从how文件提取相似度>=0.8的候选词（新方式），association=基于关联分析提取候选词（旧方式）'
			
 
				+    )
			
 
				     parser.add_argument(
			
 
				         '--enable-stage6',
			
 
				         action='store_true',
			
--- a/llm_evaluator.py
+++ b/llm_evaluator.py
@@ -140,7 +140,8 @@ class LLMEvaluator:
 
				         self,
			
 
				         original_feature: str,
			
 
				         search_words: List[str],
			
 
				-        batch_size: int = 50
			
 
				+        batch_size: int = 50,
			
 
				+        base_word: str = ""
			
 
				     ) -> List[Dict[str, Any]]:
			
 
				         """
			
 
				         分批评估搜索词（每批N个，减少API调用）
			
@@ -149,6 +150,7 @@ class LLMEvaluator:
 
				             original_feature: 原始特征
			
 
				             search_words: 搜索词列表
			
 
				             batch_size: 每批处理的搜索词数量，默认10
			
 
				+            base_word: 中心词（如果提供，要求所有组合必须包含此词）
			
 
				 
			
 
				         Returns:
			
 
				             评估结果列表（已排序）
			
@@ -179,10 +181,21 @@ class LLMEvaluator:
 
				             # 构建可选词汇字符串（逗号分隔）
			
 
				             available_words_str = "、".join(available_words)
			
 
				 
			
 
				+            # 构建 base_word 约束
			
 
				+            base_word_constraint = ""
			
 
				+            if base_word:
			
 
				+                base_word_constraint = f"""
			
 
				+## 中心词约束（重要）
			
 
				+- 所有组合词都基于中心词: **{base_word}**
			
 
				+- **禁止去掉中心词**，你只负责评分和排序
			
 
				+- source_word 必须包含 "{base_word}"
			
 
				+"""
			
 
				+
			
 
				             prompt = f"""
			
 
				 
			
 
				 # 任务说明
			
 
				-模拟你是一个内容创作者，生成的组合词要符合一个创作者在内容平台搜索的习惯。从给定关键词中提取并组合适合在小红书搜索的query词。
			
 
				+模拟你是一个内容创作者，评估并排序这些基于中心词的搜索组合。
			
 
				+{base_word_constraint}
			
 
				 
			
 
				 ## 可选词汇
			
 
				 {available_words_str}
			
@@ -192,8 +205,9 @@ class LLMEvaluator:
 
				    - 直接使用原词或括号内的同义词
			
 
				    - 多个词组合
			
 
				    - 适当精简
			
 
				-2. 不能添加可选词汇以外的新词
			
 
				-3. 按推荐程度排序(越靠前越推荐)，取top5
			
 
				+2. **source_word 必须包含中心词 "{base_word}"**（如果提供了中心词）
			
 
				+3. 不能添加可选词汇以外的新词
			
 
				+4. 按推荐程度排序(越靠前越推荐)，取top5
			
 
				 
			
 
				 ## 输出格式（JSON）:
			
 
				 [