@@ -170,45 +170,53 @@ class EnhancedSearchV2:
                     continue

                # 0.5 <= max_similarity < 0.8: keep it
-                best_match = max(
+                # Sort by similarity in descending order and take the top 3
+                sorted_matches = sorted(
                     match_results,
-                    key=lambda x: x.get('匹配结果', {}).get('相似度', 0)
+                    key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
+                    reverse=True
                 )
+                top3_matches = sorted_matches[:3]  # take the top 3
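+                # Note: the slice simply yields fewer than 3 entries when
+                # match_results has fewer matches; the removed max() call
+                # already assumed match_results is non-empty, so indexing
+                # top3_match_info[0] below stays safe.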

-                # Decide whether it is a classification or a feature
-                feature_classification = best_match.get('特征分类', [])
-                classification_path = self._build_classification_path(feature_classification)
+                # Build the top-3 match info list
+                top3_match_info = []
+                for match in top3_matches:
+                    feature_classification = match.get('特征分类', [])
+                    classification_path = self._build_classification_path(feature_classification)

-                # If the path is empty and the type is classification, search to complete the path
-                if not classification_path and best_match.get('特征类型') == '分类':
-                    feature_name_to_search = best_match.get('人设特征名称', '')
-                    classification_path = self._search_classification_path(feature_name_to_search)
+                    # If the path is empty and the type is classification, search to complete the path
+                    if not classification_path and match.get('特征类型') == '分类':
+                        feature_name_to_search = match.get('人设特征名称', '')
+                        classification_path = self._search_classification_path(feature_name_to_search)

-                is_classification = self._is_classification(best_match.get('人设特征名称', ''), classification_path)
+                    is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
+
+                    top3_match_info.append({
+                        '人设特征名称': match.get('人设特征名称'),
+                        '人设特征层级': match.get('人设特征层级'),
+                        '特征类型': match.get('特征类型'),
+                        '特征分类': feature_classification,
+                        '相似度': match.get('匹配结果', {}).get('相似度', 0),
+                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                        '是分类': is_classification,
+                        '所属分类路径': classification_path
+                    })
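+                # Note: _search_classification_path now runs once per retained
+                # match whose path comes back empty (up to 3 lookups per
+                # feature) instead of at most once in the best_match version.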

                 result_item = {
                     '原始特征名称': feature_name,
                     '来源层级': level_name,
                     '权重': feature.get('权重', 0),
                     '所属点名称': item_name,
-                    '最高匹配信息': {
-                        '人设特征名称': best_match.get('人设特征名称'),
-                        '人设特征层级': best_match.get('人设特征层级'),
-                        '特征类型': best_match.get('特征类型'),
-                        '特征分类': feature_classification,
-                        '相似度': best_match.get('匹配结果', {}).get('相似度', 0),
-                        '匹配说明': best_match.get('匹配结果', {}).get('说明', ''),
-                        '是分类': is_classification,
-                        '所属分类路径': classification_path
-                    }
+                    '最高匹配信息': top3_match_info[0],  # keep the 1st for Stage 2
+                    'top3匹配信息': top3_match_info  # new field
                 }

                 results.append(result_item)
                 selected_count += 1

-                logger.info(f"  ✓ {feature_name} → {best_match.get('人设特征名称')} "
-                            f"(相似度: {max_similarity:.3f}, "
-                            f"{'分类' if is_classification else '特征'})")
+                # Log the top-3 match info
+                top3_names = [m['人设特征名称'] for m in top3_match_info]
+                logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")

         # Summary statistics
         logger.info(f"\n" + "=" * 60)
@@ -430,6 +438,8 @@ class EnhancedSearchV2:

         """
         Stage 2: look up associated classifications; collect classification names, tags and sub-classifications

+        Improvement: look up associations for every one of the top-3 base_words
+
         Args:
             filtered_features: features selected in Stage 1
@@ -437,52 +447,75 @@ class EnhancedSearchV2:

             Feature list with association info
         """
         logger.info("=" * 60)
-        logger.info("阶段2:查找关联分类")
+        logger.info("阶段2:查找关联分类(为每个base_word)")
         logger.info("=" * 60)

         for idx, feature in enumerate(filtered_features, 1):
             logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {feature['原始特征名称']}")

-            match_info = feature['最高匹配信息']
-            is_classification = match_info['是分类']
-            classification_path = match_info['所属分类路径']
-            source_level = match_info['人设特征层级']
+            # Get the top-3 base_words
+            top3_info = feature.get('top3匹配信息', [])
+            if not top3_info:
+                logger.warning(f"  无top3匹配信息,跳过")
+                feature['找到的关联_按base_word'] = {}
+                continue
-            if is_classification:
-                # Matched a classification: use its path directly
-                search_path = classification_path
-                logger.info(f"  匹配到分类: {search_path}")
-            else:
-                # Matched a feature: use the classification path it belongs to
-                search_path = classification_path
-                logger.info(f"  匹配到特征,使用所属分类: {search_path}")
-
-            # Look up associations
-            associations = self._find_associations(search_path, source_level)
-
-            # Collect association info
-            feature['找到的关联'] = []
-
-            for assoc in associations:
-                target_path = assoc['目标分类']
-                logger.info(f"  处理关联: {target_path}")
-
-                # Collect classification info
-                classification_info = self._collect_classification_info(target_path)
-
-                if classification_info:
-                    feature['找到的关联'].append({
-                        '来源方向': assoc['来源方向'],
-                        '关联类型': assoc['关联类型'],
-                        '目标分类路径': target_path,
-                        '共同帖子数': assoc['共同帖子数'],
-                        'Jaccard相似度': assoc['Jaccard相似度'],
-                        '分类名称': classification_info['classification_name'],
-                        '标签列表': classification_info['tags'],
-                        '子分类列表': classification_info['sub_classifications']
-                    })
+            logger.info(f"  找到 {len(top3_info)} 个base_word")
+
+            # Look up associations for each base_word
+            associations_by_base_word = {}
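+            # Note: keyed by the base_word's 人设特征名称; if two of the top-3
+            # matches share the same name, the later entry overwrites the
+            # earlier one.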

-            logger.info(f"  找到 {len(feature['找到的关联'])} 个关联")
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                is_classification = base_info['是分类']
+                classification_path = base_info['所属分类路径']
+                source_level = base_info['人设特征层级']
+
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")
+
+                if is_classification:
+                    search_path = classification_path
+                    logger.info(f"    匹配到分类: {search_path}")
+                else:
+                    search_path = classification_path
+                    logger.info(f"    匹配到特征,使用所属分类: {search_path}")
+
+                # Look up associations
+                associations = self._find_associations(search_path, source_level)
+
+                # Collect association info
+                base_word_associations = []
+
+                for assoc in associations:
+                    target_path = assoc['目标分类']
+
+                    # Collect classification info
+                    classification_info = self._collect_classification_info(target_path)
+
+                    if classification_info:
+                        base_word_associations.append({
+                            '来源方向': assoc['来源方向'],
+                            '关联类型': assoc['关联类型'],
+                            '目标分类路径': target_path,
+                            '共同帖子数': assoc['共同帖子数'],
+                            'Jaccard相似度': assoc['Jaccard相似度'],
+                            '分类名称': classification_info['classification_name'],
+                            '标签列表': classification_info['tags'],
+                            '子分类列表': classification_info['sub_classifications']
+                        })
+
+                associations_by_base_word[base_word] = base_word_associations
+                logger.info(f"    找到 {len(base_word_associations)} 个关联")
+
+            # Save the results
+            feature['找到的关联_按base_word'] = associations_by_base_word
+
+            # Backward compatibility: keep the associations derived from the top match (i.e. the 1st base_word's associations)
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])
+
+            total_associations = sum(len(v) for v in associations_by_base_word.values())
+            logger.info(f"  总共找到 {total_associations} 个关联({len(associations_by_base_word)} 个base_word)")

         # Save the results
         output_path = os.path.join(self.output_dir, "stage2_associations.json")
@@ -589,8 +622,8 @@ class EnhancedSearchV2:

         """
         Stage 3: select high-similarity matches (>0.8)

-        Walks every original feature in the how-deconstruction and picks the high-quality
-        matches whose similarity is >0.8 and whose persona feature name is inside the Stage 2 association scope
+        Improvement: select candidate words independently for each base_word,
+        finding matches with similarity >0.8 in the how-deconstruction based on that base_word's own association scope

         Args:
             associations_data: association data from Stage 2
@@ -599,76 +632,107 @@ class EnhancedSearchV2:

             Data with high-similarity candidates
         """
         logger.info("=" * 60)
-        logger.info("阶段3:筛选高相似度匹配(>0.8)")
+        logger.info("阶段3:筛选高相似度匹配(>0.8,为每个base_word)")
         logger.info("=" * 60)

         for idx, feature_result in enumerate(associations_data, 1):
             original_feature_name = feature_result['原始特征名称']
             logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")

-            # Step 1: collect the Stage 2 association scope (classification names + tags)
-            stage2_scope = self._collect_stage2_scope(feature_result)
-            logger.info(f"  Stage2范围包含 {len(stage2_scope)} 个分类/标签")
+            # Get the top-3 base_words
+            top3_info = feature_result.get('top3匹配信息', [])
+            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
+
+            if not top3_info or not associations_by_base_word:
+                logger.warning(f"  无top3匹配信息或关联数据,跳过")
+                feature_result['高相似度候选_按base_word'] = {}
+                continue
+
+            logger.info(f"  找到 {len(top3_info)} 个base_word")
+
+            # Select candidate words independently for each base_word
+            candidates_by_base_word = {}
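+            # Note: the de-duplication below is per base_word, so the same
+            # candidate word may legitimately appear under several base_words.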

-            # Step 2: walk all original features in the how-deconstruction and find high-similarity matches
-            high_sim_candidates = []
-            total_checked = 0
-            high_sim_found = 0
+            for base_idx, base_info in enumerate(top3_info, 1):
+                base_word = base_info.get('人设特征名称', '')
+                logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word}")

-            how_result = self.how_data.get('how解构结果', {})
-            for level_name, level_list in how_result.items():
-                if not isinstance(level_list, list):
+                # Step 1: collect this base_word's association scope
+                base_word_associations = associations_by_base_word.get(base_word, [])
+                base_word_scope = self._collect_scope_from_associations(base_word_associations)
+                logger.info(f"    关联范围包含 {len(base_word_scope)} 个分类/标签")
+
+                if not base_word_scope:
+                    logger.warning(f"    无关联范围,跳过")
+                    candidates_by_base_word[base_word] = []
                     continue

-                for item in level_list:
-                    for step in item.get('how步骤列表', []):
-                        for feature in step.get('特征列表', []):
-                            # Get all matches for this feature
-                            matches = feature.get('匹配结果', [])
-                            total_checked += len(matches)
-
-                            # Keep matches with similarity >0.8 inside the Stage 2 scope
-                            for match in matches:
-                                sim = match.get('匹配结果', {}).get('相似度', 0)
-                                persona_feature_name = match.get('人设特征名称', '')
-
-                                if sim > 0.8 and persona_feature_name in stage2_scope:
-                                    high_sim_found += 1
-                                    # Record source info
-                                    high_sim_candidates.append({
-                                        '人设特征名称': persona_feature_name,
-                                        '相似度': sim,
-                                        '特征类型': match.get('特征类型', ''),
-                                        '特征分类': match.get('特征分类', []),
-                                        '人设特征层级': match.get('人设特征层级', ''),
-                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
-                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
-                                        '来源原始特征': feature.get('特征名称', '')  # record which original feature it came from
-                                    })
-
-            logger.info(f"  检查了 {total_checked} 个匹配")
-            logger.info(f"  找到 {high_sim_found} 个相似度>0.8的匹配")
-
-            # Sort by similarity in descending order and de-duplicate (keep only the highest score per persona feature name)
-            seen_names = set()
-            unique_candidates = []
-            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
-
-            for candidate in high_sim_candidates:
-                name = candidate['人设特征名称']
-                if name not in seen_names:
-                    seen_names.add(name)
-                    unique_candidates.append(candidate)
-
-            # Add to the results
-            feature_result['高相似度候选'] = unique_candidates
-            logger.info(f"  去重后筛选出 {len(unique_candidates)} 个高相似度候选")
-
-            # Show the first 5
-            if unique_candidates:
-                logger.info(f"  Top 5:")
-                for c in unique_candidates[:5]:
-                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
+                # Step 2: walk the how-deconstruction and find high-similarity matches
+                high_sim_candidates = []
+                total_checked = 0
+                high_sim_found = 0
+
+                how_result = self.how_data.get('how解构结果', {})
+                for level_name, level_list in how_result.items():
+                    if not isinstance(level_list, list):
+                        continue
+
+                    for item in level_list:
+                        for step in item.get('how步骤列表', []):
+                            for feature in step.get('特征列表', []):
+                                matches = feature.get('匹配结果', [])
+                                total_checked += len(matches)
+
+                                # Keep matches with similarity >0.8 inside this base_word's scope
+                                for match in matches:
+                                    sim = match.get('匹配结果', {}).get('相似度', 0)
+                                    persona_feature_name = match.get('人设特征名称', '')
+
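+                                    # Scope membership is an exact string match
+                                    # against the classification names and tags
+                                    # collected from this base_word's
+                                    # associations.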
+                                    if sim > 0.8 and persona_feature_name in base_word_scope:
+                                        high_sim_found += 1
+                                        high_sim_candidates.append({
+                                            '人设特征名称': persona_feature_name,
+                                            '相似度': sim,
+                                            '特征类型': match.get('特征类型', ''),
+                                            '特征分类': match.get('特征分类', []),
+                                            '人设特征层级': match.get('人设特征层级', ''),
+                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                            '来源原始特征': feature.get('特征名称', '')
+                                        })
+
+                logger.info(f"    检查了 {total_checked} 个匹配")
+                logger.info(f"    找到 {high_sim_found} 个相似度>0.8的匹配")
+
+                # Sort by similarity in descending order and de-duplicate
+                seen_names = set()
+                unique_candidates = []
+                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+                for candidate in high_sim_candidates:
+                    name = candidate['人设特征名称']
+                    if name not in seen_names:
+                        seen_names.add(name)
+                        unique_candidates.append(candidate)
+
+                candidates_by_base_word[base_word] = unique_candidates
+                logger.info(f"    去重后筛选出 {len(unique_candidates)} 个候选")
+
+                # Show the first 5
+                if unique_candidates:
+                    logger.info(f"    Top 5:")
+                    for c in unique_candidates[:5]:
+                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
+
+            # Save the results
+            feature_result['高相似度候选_按base_word'] = candidates_by_base_word
+
+            # Backward compatibility: keep the 1st base_word's candidates
+            first_base_word = top3_info[0].get('人设特征名称', '')
+            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])
+
+            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
+            logger.info(f"  总共筛选出 {total_candidates} 个候选({len(candidates_by_base_word)} 个base_word)")

         # Save the results
         output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
@@ -681,19 +745,19 @@ class EnhancedSearchV2:

         return associations_data

-    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
         """
-        Collect all classification names and tags found in Stage 2 into a scope set
+        Collect all classification names and tags from an association list into a scope set

         Args:
-            feature_result: feature result data
+            associations: the association list

         Returns:
             A set containing all classification names and tags
         """
         scope = set()

-        for assoc in feature_result.get('找到的关联', []):
+        for assoc in associations:
             # Add the classification name
             scope.add(assoc['分类名称'])
@@ -703,6 +767,19 @@ class EnhancedSearchV2:

         return scope

+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        Collect all classification names and tags found in Stage 2 into a scope set (compatibility wrapper for the old method)
+
+        Args:
+            feature_result: feature result data
+
+        Returns:
+            A set containing all classification names and tags
+        """
+        associations = feature_result.get('找到的关联', [])
+        return self._collect_scope_from_associations(associations)
+
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Find the feature list for a given path
@@ -797,12 +874,16 @@ class EnhancedSearchV2:

         """
         Generate and evaluate combinations for a single original feature

+        Improvement: each base_word uses its own candidate words (instead of a shared pool)
+
         Steps:
-        1. Get base_word from Stage1's 最高匹配信息
-        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
-        3. Generate 2-N word combinations
-        4. LLM batch evaluation
-        5. Select Top 10 and write back
+        1. Get top3 base_words from Stage1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from Stage3's 高相似度候选_按base_word
+           b. Generate combinations
+           c. LLM evaluation
+           d. Select Top 10
+        3. Save grouped results

         Args:
             idx: feature index
@@ -814,62 +895,94 @@ class EnhancedSearchV2:

         original_feature = feature_result['原始特征名称']
         logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")

-        # Step 1: get the base word
-        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
-        if not base_word:
-            logger.info(f"  无基础词,跳过")
-            feature_result['组合评估结果'] = []
+        # Step 1: get the top-3 base words
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info(f"  无top3匹配信息,跳过")
+            feature_result['组合评估结果_分组'] = []
             return

-        logger.info(f"  基础词: {base_word}")
+        logger.info(f"  找到 {len(top3_info)} 个base_word")

-        # Step 2: get candidate words (from the high-similarity candidates)
-        high_sim_candidates = feature_result.get('高相似度候选', [])
+        # Step 2: get the candidate words grouped by base_word
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})

-        # Cap the number of candidate words
-        candidates = high_sim_candidates[:max_candidates]
-        candidate_words = [c['人设特征名称'] for c in candidates]
-
-        if not candidate_words:
-            logger.info(f"  无候选词,跳过")
-            feature_result['组合评估结果'] = []
+        if not candidates_by_base_word:
+            logger.warning(f"  无按base_word分组的候选词,跳过")
+            feature_result['组合评估结果_分组'] = []
             return

-        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+        # Step 3: process each base_word independently
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word} (相似度: {base_similarity:.3f})")

-        # Step 3: generate all combinations
-        all_combinations = []
+            # Get this base_word's candidate words
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['人设特征名称'] for c in candidates]

-        # Generate combinations of 1 to max_combo_length-1 candidate words (base_word is added on top)
-        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
-            for combo in combinations(candidate_words, length):
-                # Build the search phrase: base word + candidate combination
-                search_phrase = base_word + ' ' + ' '.join(combo)
-                all_combinations.append({
-                    'search_word': search_phrase,
+            if not candidate_words:
+                logger.warning(f"    该base_word无候选词,跳过")
+                grouped_results.append({
                     'base_word': base_word,
-                    'candidate_words': list(combo),
-                    'combo_length': length + 1  # +1 because base_word is included
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
                 })
+                continue

-        logger.info(f"  生成 {len(all_combinations)} 个组合")
+            logger.info(f"    候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+            # Generate combinations
+            combinations_for_base = []
+            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+                for combo in combinations(candidate_words, length):
+                    search_phrase = base_word + ' ' + ' '.join(combo)
+                    combinations_for_base.append({
+                        'search_word': search_phrase,
+                        'base_word': base_word,
+                        'candidate_words': list(combo),
+                        'combo_length': length + 1
+                    })
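+            # With n candidate words this yields sum(C(n, k)) phrases for
+            # k = 1 .. min(max_combo_length - 1, n), so max_candidates directly
+            # bounds the number of LLM evaluations below.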

-        # Step 4: LLM batch evaluation
-        logger.info(f"  开始LLM评估...")
-        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-            original_feature=original_feature,
-            search_words=[c['search_word'] for c in all_combinations],
-            batch_size=50
-        )
+            logger.info(f"    生成 {len(combinations_for_base)} 个组合")
+
+            # LLM evaluation
+            logger.info(f"    开始LLM评估...")
+            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+                original_feature=original_feature,
+                search_words=[c['search_word'] for c in combinations_for_base],
+                batch_size=50
+            )
+
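+            # Assumes evaluate_search_words_in_batches returns items sorted by
+            # score in descending order, as the previous single-base_word code
+            # path already relied on.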
+            # Pick the Top 10
+            top_10 = evaluated[:10]
+            max_score = top_10[0]['score'] if top_10 else 0.0
+            logger.info(f"    评估完成,Top 10 最高分: {max_score:.3f}")

-        # Step 5: pick the Top 10
-        top_10 = evaluated[:10]
+            # Save the grouped result - each base_word has its own available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # this base_word's own candidate words
+            })

         # Write back the results
-        feature_result['组合评估结果'] = top_10
+        feature_result['组合评估结果_分组'] = grouped_results

-        max_score = top_10[0]['score'] if top_10 else 0.0
-        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  完成!共 {len(grouped_results)} 个base_word,{total_searches} 个搜索词")

     # ========== Stage 5: execute searches ==========
@@ -954,7 +1067,7 @@ class EnhancedSearchV2:

         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)

-        # Group search words by original feature (read from Stage 4's 组合评估结果)
+        # Group search words by original feature (read from Stage 4's 组合评估结果_分组)
         feature_search_groups = {}

         for feature_result in features_data:
@@ -963,46 +1076,60 @@ class EnhancedSearchV2:

             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []

-            # Read from Stage 4's 组合评估结果
-            for eval_item in feature_result.get('组合评估结果', []):
-                sw = eval_item.get('search_word')
-                if not sw:
-                    continue
+            # Read from Stage 4's 组合评估结果_分组 (new structure)
+            grouped_results = feature_result.get('组合评估结果_分组', [])

-                score = eval_item.get('score', 0.0)
+            if grouped_results:
+                # Use the grouped structure: every base_word's top 10 gets executed
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)

-                feature_search_groups[original_feature].append({
-                    'search_word': sw,
-                    'score': score,
-                    'feature_ref': eval_item  # reference to the evaluation item, used for writing back search results
-                })
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue

-        # Take Top N per group
-        all_searches = []
-        total_before_filter = 0
-        total_filtered = 0
+                        score = eval_item.get('score', 0.0)

-        for original_feature, search_list in feature_search_groups.items():
-            total_before_filter += len(search_list)
+                        feature_search_groups[original_feature].append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # reference to the evaluation item, used for writing back search results
+                        })
+            else:
+                # Backward compatibility with the old structure (组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue

-            # Sort by score in descending order
-            sorted_list = sorted(search_list, key=lambda x: x['score'], reverse=True)
+                    score = eval_item.get('score', 0.0)

-            # Take the first top_n
-            selected = sorted_list[:top_n]
-            all_searches.extend(selected)
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
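+                    # Legacy items omit the base_word fields that the grouped
+                    # branch above adds.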

+        # Collect all search tasks (with the grouped structure every base_word's top 10 runs; no per-group filtering any more)
+        all_searches = []
+        total_count = 0

-            filtered = len(sorted_list) - len(selected)
-            total_filtered += filtered
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)

-            logger.info(f"  {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
+            logger.info(f"  {original_feature}: {len(search_list)} 个搜索词")

         # Apply the global search-count limit
         if self.max_total_searches and len(all_searches) > self.max_total_searches:
             logger.info(f"  应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
             all_searches = all_searches[:self.max_total_searches]
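+            # Note: the global cap truncates in feature iteration order, not
+            # by score; sorting all_searches by score first would keep the
+            # highest-scoring tasks instead.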

-        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
+        logger.info(f"\n共 {len(all_searches)} 个搜索任务")
         logger.info(f"  并发执行搜索(并发数: {self.search_max_workers})")

         # Execute searches concurrently with a ThreadPoolExecutor