hai 2 meses · 2d0863a6d2
--- a/enhanced_search_v2.py
+++ b/enhanced_search_v2.py
@@ -51,6 +51,7 @@ class EnhancedSearchV2:
 
															         search_max_workers: int = 3,
														
 
															         max_searches_per_feature: Optional[int] = None,
														
 
															         max_searches_per_base_word: Optional[int] = None,
														
 
															+        combination_source: str = "how_based",
														
 
															         enable_stage6: bool = False,
														
 
															         stage6_max_workers: int = 10,
														
 
															         stage6_max_notes: int = 20,
														
@@ -78,6 +79,9 @@ class EnhancedSearchV2:
 
															             search_max_workers: 搜索并发数（默认3）
														
 
															             max_searches_per_feature: 每个原始特征的最大搜索次数（默认None不限制）
														
 
															             max_searches_per_base_word: 每个base_word的最大搜索次数（默认None不限制）
														
 
															+            combination_source: 组合词来源方式（默认how_based）
														
 
															+                - "how_based": 从how文件提取相似度>=0.8的候选词（新方式，默认）
														
 
															+                - "association": 基于关联分析提取候选词（旧方式）
														
 
															             enable_stage6: 是否启用Stage 6评估（默认False）
														
 
															             stage6_max_workers: Stage 6并发评估数（默认10）
														
 
															             stage6_max_notes: 每个搜索结果评估的最大帖子数（默认20）
														
@@ -100,6 +104,7 @@ class EnhancedSearchV2:
 
															         self.search_max_workers = search_max_workers
														
 
															         self.max_searches_per_feature = max_searches_per_feature
														
 
															         self.max_searches_per_base_word = max_searches_per_base_word
														
 
															+        self.combination_source = combination_source
														
 
															         self.enable_stage6 = enable_stage6
														
 
															         self.stage6_max_workers = stage6_max_workers
														
 
															         self.stage6_max_notes = stage6_max_notes
														
@@ -913,6 +918,125 @@ class EnhancedSearchV2:
 
															         return scope
														
 
															+    def stage23_extract_candidates_from_how(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
														
 
															+        """
														
 
															+        新方式：从how文件提取相似度>=0.8的候选词
														
 
															+        替代 Stage 2-3，但构造相同的数据结构
														
 
															+
														
 
															+        处理流程:
														
 
															+        1. 遍历 how_data['how解构结果'] 所有特征的匹配结果
														
 
															+        2. 筛选 相似度 >= 0.8 的人设特征名称
														
 
															+        3. 去重（按最高相似度保留）
														
 
															+        4. 按相似度降序排序
														
 
															+        5. 为每个中心词复制相同的候选词列表
														
 
															+        6. 构造 '高相似度候选_按base_word' 结构
														
 
															+
														
 
															+        Args:
														
 
															+            filtered_features: Stage 1筛选的特征列表
														
 
															+
														
 
															+        Returns:
														
 
															+            与Stage 3输出结构完全一致的特征列表
														
 
															+        """
														
 
															+        logger.info("=" * 60)
														
 
															+        logger.info("Stage 2-3 (新方式): 从how文件提取高相似度候选词")
														
 
															+        logger.info("=" * 60)
														
 
															+
														
 
															+        # Step 1: 从整个how文件提取候选词
														
 
															+        candidates_dict = {}  # {人设特征名称: {候选词信息}}
														
 
															+
														
 
															+        how_result = self.how_data.get('how解构结果', {})
														
 
															+
														
 
															+        # 遍历三个维度
														
 
															+        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
														
 
															+            features_list = how_result.get(dimension, [])
														
 
															+
														
 
															+            for item in features_list:
														
 
															+                item_name = item.get('名称', '')
														
 
															+                how_steps = item.get('how步骤列表', [])
														
 
															+
														
 
															+                for step in how_steps:
														
 
															+                    for feature in step.get('特征列表', []):
														
 
															+                        feature_name = feature.get('特征名称', '')
														
 
															+                        matches = feature.get('匹配结果', [])
														
 
															+
														
 
															+                        for match in matches:
														
 
															+                            # 获取相似度（从匹配结果的嵌套结构中）
														
 
															+                            similarity = match.get('匹配结果', {}).get('相似度', 0)
														
 
															+                            persona_feature_name = match.get('人设特征名称', '')
														
 
															+
														
 
															+                            # 筛选相似度 >= 0.8
														
 
															+                            if similarity >= 0.8 and persona_feature_name:
														
 
															+                                # 去重逻辑：保留最高相似度
														
 
															+                                if persona_feature_name not in candidates_dict or \
														
 
															+                                   similarity > candidates_dict[persona_feature_name]['相似度']:
														
 
															+                                    candidates_dict[persona_feature_name] = {
														
 
															+                                        '人设特征名称': persona_feature_name,
														
 
															+                                        '相似度': similarity,
														
 
															+                                        '特征类型': match.get('特征类型', ''),
														
 
															+                                        '特征分类': match.get('特征分类', []),
														
 
															+                                        '人设特征层级': match.get('人设特征层级', ''),
														
 
															+                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
														
 
															+                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
														
 
															+                                        '来源原始特征': feature_name
														
 
															+                                    }
														
 
															+
														
 
															+        # Step 2: 转为列表并按相似度降序排序
														
 
															+        global_candidates = sorted(
														
 
															+            candidates_dict.values(),
														
 
															+            key=lambda x: x['相似度'],
														
 
															+            reverse=True
														
 
															+        )
														
 
															+
														
 
															+        logger.info(f"从how文件提取到 {len(global_candidates)} 个唯一的高相似度候选词")
														
 
															+
														
 
															+        # 显示Top 10候选词
														
 
															+        if global_candidates:
														
 
															+            logger.info("Top 10 候选词:")
														
 
															+            for i, candidate in enumerate(global_candidates[:10], 1):
														
 
															+                logger.info(f"  {i}. {candidate['人设特征名称']} (相似度: {candidate['相似度']:.3f})")
														
 
															+
														
 
															+        # Step 3: 为每个特征构造输出结构
														
 
															+        results = []
														
 
															+        for idx, feature_data in enumerate(filtered_features, 1):
														
 
															+            original_feature_name = feature_data.get('原始特征名称', '')
														
 
															+            logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {original_feature_name}")
														
 
															+
														
 
															+            top3_matches = feature_data.get('top3匹配信息', [])
														
 
															+
														
 
															+            # 提取3个中心词
														
 
															+            base_words = [match.get('人设特征名称', '') for match in top3_matches[:3]]
														
 
															+            logger.info(f"  中心词: {', '.join(base_words)}")
														
 
															+
														
 
															+            # 所有中心词共享相同的候选词列表
														
 
															+            high_similarity_by_base = {}
														
 
															+            for base_word in base_words:
														
 
															+                if base_word:
														
 
															+                    high_similarity_by_base[base_word] = global_candidates.copy()
														
 
															+
														
 
															+            logger.info(f"  每个中心词分配 {len(global_candidates)} 个候选词")
														
 
															+
														
 
															+            result = {
														
 
															+                '原始特征名称': original_feature_name,
														
 
															+                '来源层级': feature_data.get('来源层级', ''),  # 保留元数据
														
 
															+                '权重': feature_data.get('权重', 0),  # 保留元数据
														
 
															+                'top3匹配信息': top3_matches,
														
 
															+                '找到的关联_按base_word': {},  # 新方式不需要关联分析
														
 
															+                '高相似度候选_按base_word': high_similarity_by_base
														
 
															+            }
														
 
															+            results.append(result)
														
 
															+
														
 
															+        # 保存结果
														
 
															+        output_path = os.path.join(self.output_dir, 'stage3_high_similarity_how_based.json')
														
 
															+        self._save_json(results, output_path)
														
 
															+
														
 
															+        logger.info(f"\n" + "=" * 60)
														
 
															+        logger.info(f"Stage 2-3 (新方式) 完成")
														
 
															+        logger.info(f"  提取候选词: {len(global_candidates)} 个")
														
 
															+        logger.info(f"  处理特征: {len(results)} 个")
														
 
															+        logger.info("=" * 60)
														
 
															+
														
 
															+        return results
														
 
															+
														
 
															     def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
														
 
															         """
														
 
															         收集Stage2找到的所有分类名和标签，形成范围集合（兼容旧方法）
														
@@ -1088,16 +1212,18 @@ class EnhancedSearchV2:
 
															             logger.info(f"    候选词数量: {len(candidate_words)} (限制: {max_candidates})")
														
 
															-            # 生成组合
														
 
															+            # 生成组合（简化策略：只生成 base_word + 1词 和 base_word + 2词）
														
 
															             combinations_for_base = []
														
 
															-            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
														
 
															+            max_additional_words = 2  # 最多额外添加2个词（生成 base_word + 1词 和 base_word + 2词）
														
 
															+
														
 
															+            for length in range(1, min(max_additional_words + 1, len(candidate_words) + 1)):
														
 
															                 for combo in combinations(candidate_words, length):
														
 
															                     search_phrase = base_word + ' ' + ' '.join(combo)
														
 
															                     combinations_for_base.append({
														
 
															                         'search_word': search_phrase,
														
 
															                         'base_word': base_word,
														
 
															                         'candidate_words': list(combo),
														
 
															-                        'combo_length': length + 1
														
 
															+                        'combo_length': length + 1  # +1 因为包含 base_word
														
 
															                     })
														
 
															             logger.info(f"    生成 {len(combinations_for_base)} 个组合")
														
@@ -1107,7 +1233,8 @@ class EnhancedSearchV2:
 
															             evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
														
 
															                 original_feature=original_feature,
														
 
															                 search_words=[c['search_word'] for c in combinations_for_base],
														
 
															-                batch_size=50
														
 
															+                batch_size=50,
														
 
															+                base_word=base_word  # 传递中心词，确保生成的 source_word 包含 base_word
														
 
															             )
														
 
															             # 选出Top 10
														
@@ -1696,11 +1823,18 @@ class EnhancedSearchV2:
 
															             # 阶段1
														
 
															             stage1_results = self.stage1_filter_features()
														
 
															-            # 阶段2
														
 
															-            stage2_results = self.stage2_find_associations(stage1_results)
														
 
															-
														
 
															-            # 阶段3 - 使用新方法：筛选高相似度匹配
														
 
															-            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
														
 
															+            # 阶段2-3：根据 combination_source 选择方式
														
 
															+            if self.combination_source == "how_based":
														
 
															+                # 新方式：直接从how文件提取候选词（跳过Stage 2，直接生成Stage 3格式）
														
 
															+                logger.info(f"\n使用组合词来源方式: {self.combination_source} (新方式)")
														
 
															+                stage3_results = self.stage23_extract_candidates_from_how(stage1_results)
														
 
															+            else:
														
 
															+                # 旧方式：基于关联分析（association）
														
 
															+                logger.info(f"\n使用组合词来源方式: {self.combination_source} (旧方式)")
														
 
															+                # 阶段2
														
 
															+                stage2_results = self.stage2_find_associations(stage1_results)
														
 
															+                # 阶段3
														
 
															+                stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
														
 
															             # 阶段4
														
 
															             stage4_results = self.stage4_generate_and_evaluate_search_words(
														
@@ -1834,6 +1968,13 @@ def main():
 
															         default=None,
														
 
															         help='每个base_word的最大搜索次数（默认None不限制）'
														
 
															     )
														
 
															+    parser.add_argument(
														
 
															+        '--combination-source',
														
 
															+        type=str,
														
 
															+        choices=['how_based', 'association'],
														
 
															+        default='how_based',
														
 
															+        help='组合词来源方式（默认how_based）：how_based=从how文件提取相似度>=0.8的候选词（新方式），association=基于关联分析提取候选词（旧方式）'
														
 
															+    )
														
 
															     parser.add_argument(
														
 
															         '--enable-stage6',
														
 
															         action='store_true',
														
--- a/llm_evaluator.py
+++ b/llm_evaluator.py
@@ -140,7 +140,8 @@ class LLMEvaluator:
 
															         self,
														
 
															         original_feature: str,
														
 
															         search_words: List[str],
														
 
															-        batch_size: int = 50
														
 
															+        batch_size: int = 50,
														
 
															+        base_word: str = ""
														
 
															     ) -> List[Dict[str, Any]]:
														
 
															         """
														
 
															         分批评估搜索词（每批N个，减少API调用）
														
@@ -149,6 +150,7 @@ class LLMEvaluator:
 
															             original_feature: 原始特征
														
 
															             search_words: 搜索词列表
														
 
															             batch_size: 每批处理的搜索词数量，默认50
														
 
															+            base_word: 中心词（如果提供，要求所有组合必须包含此词）
														
 
															         Returns:
														
 
															             评估结果列表（已排序）
														
@@ -179,10 +181,21 @@ class LLMEvaluator:
 
															             # 构建可选词汇字符串（逗号分隔）
														
 
															             available_words_str = "、".join(available_words)
														
 
															+            # 构建 base_word 约束
														
 
															+            base_word_constraint = ""
														
 
															+            if base_word:
														
 
															+                base_word_constraint = f"""
														
 
															+## 中心词约束（重要）
														
 
															+- 所有组合词都基于中心词: **{base_word}**
														
 
															+- **禁止去掉中心词**，你只负责评分和排序
														
 
															+- source_word 必须包含 "{base_word}"
														
 
															+"""
														
 
															+
														
 
															             prompt = f"""
														
 
															 # 任务说明
														
 
															-模拟你是一个内容创作者，生成的组合词要符合一个创作者在内容平台搜索的习惯。从给定关键词中提取并组合适合在小红书搜索的query词。
														
 
															+模拟你是一个内容创作者，评估并排序这些基于中心词的搜索组合。
														
 
															+{base_word_constraint}
														
 
															 ## 可选词汇
														
 
															 {available_words_str}
														
@@ -192,8 +205,9 @@ class LLMEvaluator:
 
															    - 直接使用原词或括号内的同义词
														
 
															    - 多个词组合
														
 
															    - 适当精简
														
 
															-2. 不能添加可选词汇以外的新词
														
 
															-3. 按推荐程度排序(越靠前越推荐)，取top5
														
 
															+2. **source_word 必须包含中心词 "{base_word}"**（如果提供了中心词）
														
 
															+3. 不能添加可选词汇以外的新词
														
 
															+4. 按推荐程度排序(越靠前越推荐)，取top5
														
 
															 ## 输出格式（JSON）:
														
 
															 [