|
@@ -51,6 +51,7 @@ class EnhancedSearchV2:
|
|
|
search_max_workers: int = 3,
|
|
search_max_workers: int = 3,
|
|
|
max_searches_per_feature: Optional[int] = None,
|
|
max_searches_per_feature: Optional[int] = None,
|
|
|
max_searches_per_base_word: Optional[int] = None,
|
|
max_searches_per_base_word: Optional[int] = None,
|
|
|
|
|
+ combination_source: str = "how_based",
|
|
|
enable_stage6: bool = False,
|
|
enable_stage6: bool = False,
|
|
|
stage6_max_workers: int = 10,
|
|
stage6_max_workers: int = 10,
|
|
|
stage6_max_notes: int = 20,
|
|
stage6_max_notes: int = 20,
|
|
@@ -78,6 +79,9 @@ class EnhancedSearchV2:
|
|
|
search_max_workers: 搜索并发数(默认3)
|
|
search_max_workers: 搜索并发数(默认3)
|
|
|
max_searches_per_feature: 每个原始特征的最大搜索次数(默认None不限制)
|
|
max_searches_per_feature: 每个原始特征的最大搜索次数(默认None不限制)
|
|
|
max_searches_per_base_word: 每个base_word的最大搜索次数(默认None不限制)
|
|
max_searches_per_base_word: 每个base_word的最大搜索次数(默认None不限制)
|
|
|
|
|
+ combination_source: 组合词来源方式(默认how_based)
|
|
|
|
|
+ - "how_based": 从how文件提取相似度>=0.8的候选词(新方式,默认)
|
|
|
|
|
+ - "association": 基于关联分析提取候选词(旧方式)
|
|
|
enable_stage6: 是否启用Stage 6评估(默认False)
|
|
enable_stage6: 是否启用Stage 6评估(默认False)
|
|
|
stage6_max_workers: Stage 6并发评估数(默认10)
|
|
stage6_max_workers: Stage 6并发评估数(默认10)
|
|
|
stage6_max_notes: 每个搜索结果评估的最大帖子数(默认20)
|
|
stage6_max_notes: 每个搜索结果评估的最大帖子数(默认20)
|
|
@@ -100,6 +104,7 @@ class EnhancedSearchV2:
|
|
|
self.search_max_workers = search_max_workers
|
|
self.search_max_workers = search_max_workers
|
|
|
self.max_searches_per_feature = max_searches_per_feature
|
|
self.max_searches_per_feature = max_searches_per_feature
|
|
|
self.max_searches_per_base_word = max_searches_per_base_word
|
|
self.max_searches_per_base_word = max_searches_per_base_word
|
|
|
|
|
+ self.combination_source = combination_source
|
|
|
self.enable_stage6 = enable_stage6
|
|
self.enable_stage6 = enable_stage6
|
|
|
self.stage6_max_workers = stage6_max_workers
|
|
self.stage6_max_workers = stage6_max_workers
|
|
|
self.stage6_max_notes = stage6_max_notes
|
|
self.stage6_max_notes = stage6_max_notes
|
|
@@ -913,6 +918,125 @@ class EnhancedSearchV2:
|
|
|
|
|
|
|
|
return scope
|
|
return scope
|
|
|
|
|
|
|
|
|
|
def stage23_extract_candidates_from_how(
    self,
    filtered_features: List[Dict[str, Any]],
    similarity_threshold: float = 0.8,
) -> List[Dict[str, Any]]:
    """
    Build Stage 3-shaped results directly from the how file ("how_based" path).

    Used in place of Stage 2-3: instead of running association analysis,
    every match found under ``self.how_data['how解构结果']`` whose similarity
    reaches the threshold becomes a candidate word.

    Flow:
        1. Walk the matches of every feature in the three dimension lists.
        2. Keep persona feature names whose similarity is
           >= ``similarity_threshold``.
        3. De-duplicate by persona feature name, keeping the highest
           similarity seen.
        4. Sort the candidates by similarity, descending.
        5. Assign the same candidate list to every non-empty base word
           (first three entries of 'top3匹配信息') of every input feature.
        6. Save the results via ``self._save_json`` to
           ``stage3_high_similarity_how_based.json`` under ``self.output_dir``.

    Missing or null (``None``) containers in the how data are treated as
    empty, and matches whose similarity is not a number are skipped, so a
    partially filled how file does not abort the stage.

    Args:
        filtered_features: Features selected by Stage 1. Each entry is read
            for '原始特征名称', '来源层级', '权重' and 'top3匹配信息'.
        similarity_threshold: Inclusive lower bound a match's similarity must
            reach to become a candidate. Defaults to 0.8.

    Returns:
        One dict per input feature with the keys '原始特征名称', '来源层级',
        '权重', 'top3匹配信息', '找到的关联_按base_word' (always empty on this
        path) and '高相似度候选_按base_word'. This is intended to mirror the
        Stage 3 output structure.
    """

    def iter_matches(how_result):
        """Yield (source feature name, match dict) for every match in the how data."""
        for dimension in ('灵感点列表', '关键点列表', '目的点列表'):
            for item in how_result.get(dimension) or []:
                for step in item.get('how步骤列表') or []:
                    for feature in step.get('特征列表') or []:
                        feature_name = feature.get('特征名称', '')
                        for match in feature.get('匹配结果') or []:
                            yield feature_name, match

    logger.info("=" * 60)
    logger.info("Stage 2-3 (新方式): 从how文件提取高相似度候选词")
    logger.info("=" * 60)

    # Step 1: collect candidates from the whole how file,
    # keyed by persona feature name.
    candidates_dict: Dict[str, Dict[str, Any]] = {}

    # `or {}` (rather than a .get default) also covers an explicit null value.
    how_result = self.how_data.get('how解构结果') or {}

    for feature_name, match in iter_matches(how_result):
        # The similarity lives in the nested '匹配结果' dict of each match.
        match_detail = match.get('匹配结果') or {}
        similarity = match_detail.get('相似度', 0)
        persona_feature_name = match.get('人设特征名称', '')

        if not persona_feature_name:
            continue
        if not isinstance(similarity, (int, float)):
            # A null / non-numeric similarity cannot be compared; skip it.
            continue
        # Written as a positive comparison so NaN is rejected as well.
        if not similarity >= similarity_threshold:
            continue

        # De-duplicate: only a strictly higher similarity replaces an entry.
        existing = candidates_dict.get(persona_feature_name)
        if existing is not None and similarity <= existing['相似度']:
            continue

        classification = match.get('特征分类', [])
        candidates_dict[persona_feature_name] = {
            '人设特征名称': persona_feature_name,
            '相似度': similarity,
            '特征类型': match.get('特征类型', ''),
            '特征分类': classification,
            '人设特征层级': match.get('人设特征层级', ''),
            '来源路径': self._build_classification_path(classification),
            '匹配说明': match_detail.get('说明', ''),
            '来源原始特征': feature_name,
        }

    # Step 2: flatten to a list sorted by similarity, highest first.
    global_candidates = sorted(
        candidates_dict.values(),
        key=lambda candidate: candidate['相似度'],
        reverse=True,
    )

    logger.info(f"从how文件提取到 {len(global_candidates)} 个唯一的高相似度候选词")

    # Show the top 10 candidates.
    if global_candidates:
        logger.info("Top 10 候选词:")
        for i, candidate in enumerate(global_candidates[:10], 1):
            logger.info(f" {i}. {candidate['人设特征名称']} (相似度: {candidate['相似度']:.3f})")

    # Step 3: build the output structure for every input feature.
    results = []
    for idx, feature_data in enumerate(filtered_features, 1):
        original_feature_name = feature_data.get('原始特征名称', '')
        logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {original_feature_name}")

        top3_matches = feature_data.get('top3匹配信息') or []

        # The first three matches provide the base words.
        base_words = [
            top_match.get('人设特征名称') or ''
            for top_match in top3_matches[:3]
        ]
        logger.info(f" 中心词: {', '.join(base_words)}")

        # Every base word gets its own list object, but the candidate dicts
        # inside are shared between base words and features.
        high_similarity_by_base = {
            base_word: list(global_candidates)
            for base_word in base_words
            if base_word
        }

        logger.info(f" 每个中心词分配 {len(global_candidates)} 个候选词")

        results.append({
            '原始特征名称': original_feature_name,
            '来源层级': feature_data.get('来源层级', ''),  # metadata carried over
            '权重': feature_data.get('权重', 0),  # metadata carried over
            'top3匹配信息': top3_matches,
            '找到的关联_按base_word': {},  # no association analysis on this path
            '高相似度候选_按base_word': high_similarity_by_base,
        })

    # Persist the results.
    output_path = os.path.join(self.output_dir, 'stage3_high_similarity_how_based.json')
    self._save_json(results, output_path)

    logger.info("\n" + "=" * 60)
    logger.info("Stage 2-3 (新方式) 完成")
    logger.info(f" 提取候选词: {len(global_candidates)} 个")
    logger.info(f" 处理特征: {len(results)} 个")
    logger.info("=" * 60)

    return results
|
|
|
|
|
+
|
|
|
def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
|
|
def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
|
|
|
"""
|
|
"""
|
|
|
收集Stage2找到的所有分类名和标签,形成范围集合(兼容旧方法)
|
|
收集Stage2找到的所有分类名和标签,形成范围集合(兼容旧方法)
|
|
@@ -1088,16 +1212,18 @@ class EnhancedSearchV2:
|
|
|
|
|
|
|
|
logger.info(f" 候选词数量: {len(candidate_words)} (限制: {max_candidates})")
|
|
logger.info(f" 候选词数量: {len(candidate_words)} (限制: {max_candidates})")
|
|
|
|
|
|
|
|
- # 生成组合
|
|
|
|
|
|
|
+ # 生成组合(简化策略:只生成 base_word + 1词 和 base_word + 2词)
|
|
|
combinations_for_base = []
|
|
combinations_for_base = []
|
|
|
- for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
|
|
|
|
|
|
|
+ max_additional_words = 2 # 最多额外添加2个词(生成 base_word + 1词 和 base_word + 2词)
|
|
|
|
|
+
|
|
|
|
|
+ for length in range(1, min(max_additional_words + 1, len(candidate_words) + 1)):
|
|
|
for combo in combinations(candidate_words, length):
|
|
for combo in combinations(candidate_words, length):
|
|
|
search_phrase = base_word + ' ' + ' '.join(combo)
|
|
search_phrase = base_word + ' ' + ' '.join(combo)
|
|
|
combinations_for_base.append({
|
|
combinations_for_base.append({
|
|
|
'search_word': search_phrase,
|
|
'search_word': search_phrase,
|
|
|
'base_word': base_word,
|
|
'base_word': base_word,
|
|
|
'candidate_words': list(combo),
|
|
'candidate_words': list(combo),
|
|
|
- 'combo_length': length + 1
|
|
|
|
|
|
|
+ 'combo_length': length + 1 # +1 因为包含 base_word
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
logger.info(f" 生成 {len(combinations_for_base)} 个组合")
|
|
logger.info(f" 生成 {len(combinations_for_base)} 个组合")
|
|
@@ -1107,7 +1233,8 @@ class EnhancedSearchV2:
|
|
|
evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
|
|
evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
|
|
|
original_feature=original_feature,
|
|
original_feature=original_feature,
|
|
|
search_words=[c['search_word'] for c in combinations_for_base],
|
|
search_words=[c['search_word'] for c in combinations_for_base],
|
|
|
- batch_size=50
|
|
|
|
|
|
|
+ batch_size=50,
|
|
|
|
|
+ base_word=base_word # 传递中心词,确保生成的 source_word 包含 base_word
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 选出Top 10
|
|
# 选出Top 10
|
|
@@ -1696,11 +1823,18 @@ class EnhancedSearchV2:
|
|
|
# 阶段1
|
|
# 阶段1
|
|
|
stage1_results = self.stage1_filter_features()
|
|
stage1_results = self.stage1_filter_features()
|
|
|
|
|
|
|
|
- # 阶段2
|
|
|
|
|
- stage2_results = self.stage2_find_associations(stage1_results)
|
|
|
|
|
-
|
|
|
|
|
- # 阶段3 - 使用新方法:筛选高相似度匹配
|
|
|
|
|
- stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
|
|
|
|
|
|
|
+ # 阶段2-3:根据 combination_source 选择方式
|
|
|
|
|
+ if self.combination_source == "how_based":
|
|
|
|
|
+ # 新方式:直接从how文件提取候选词(跳过Stage 2,直接生成Stage 3格式)
|
|
|
|
|
+ logger.info(f"\n使用组合词来源方式: {self.combination_source} (新方式)")
|
|
|
|
|
+ stage3_results = self.stage23_extract_candidates_from_how(stage1_results)
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 旧方式:基于关联分析(association)
|
|
|
|
|
+ logger.info(f"\n使用组合词来源方式: {self.combination_source} (旧方式)")
|
|
|
|
|
+ # 阶段2
|
|
|
|
|
+ stage2_results = self.stage2_find_associations(stage1_results)
|
|
|
|
|
+ # 阶段3
|
|
|
|
|
+ stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
|
|
|
|
|
|
|
|
# 阶段4
|
|
# 阶段4
|
|
|
stage4_results = self.stage4_generate_and_evaluate_search_words(
|
|
stage4_results = self.stage4_generate_and_evaluate_search_words(
|
|
@@ -1834,6 +1968,13 @@ def main():
|
|
|
default=None,
|
|
default=None,
|
|
|
help='每个base_word的最大搜索次数(默认None不限制)'
|
|
help='每个base_word的最大搜索次数(默认None不限制)'
|
|
|
)
|
|
)
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ '--combination-source',
|
|
|
|
|
+ type=str,
|
|
|
|
|
+ choices=['how_based', 'association'],
|
|
|
|
|
+ default='how_based',
|
|
|
|
|
+ help='组合词来源方式(默认how_based):how_based=从how文件提取相似度>=0.8的候选词(新方式),association=基于关联分析提取候选词(旧方式)'
|
|
|
|
|
+ )
|
|
|
parser.add_argument(
|
|
parser.add_argument(
|
|
|
'--enable-stage6',
|
|
'--enable-stage6',
|
|
|
action='store_true',
|
|
action='store_true',
|