|
@@ -11,9 +11,11 @@ import copy
|
|
|
import time
|
|
import time
|
|
|
import os
|
|
import os
|
|
|
import argparse
|
|
import argparse
|
|
|
|
|
+import subprocess
|
|
|
from typing import Dict, List, Any, Optional, Set, Tuple
|
|
from typing import Dict, List, Any, Optional, Set, Tuple
|
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
+from itertools import combinations
|
|
|
|
|
|
|
|
from openrouter_client import OpenRouterClient
|
|
from openrouter_client import OpenRouterClient
|
|
|
from llm_evaluator import LLMEvaluator
|
|
from llm_evaluator import LLMEvaluator
|
|
@@ -41,7 +43,10 @@ class EnhancedSearchV2:
|
|
|
dimension_associations_path: str,
|
|
dimension_associations_path: str,
|
|
|
optimized_clustered_data_path: str,
|
|
optimized_clustered_data_path: str,
|
|
|
openrouter_api_key: Optional[str] = None,
|
|
openrouter_api_key: Optional[str] = None,
|
|
|
- output_dir: str = "output_v2"
|
|
|
|
|
|
|
+ output_dir: str = "output_v2",
|
|
|
|
|
+ top_n: int = 10,
|
|
|
|
|
+ max_total_searches: Optional[int] = None,
|
|
|
|
|
+ search_max_workers: int = 3
|
|
|
):
|
|
):
|
|
|
"""
|
|
"""
|
|
|
初始化系统
|
|
初始化系统
|
|
@@ -52,11 +57,17 @@ class EnhancedSearchV2:
|
|
|
optimized_clustered_data_path: 人设特征库路径
|
|
optimized_clustered_data_path: 人设特征库路径
|
|
|
openrouter_api_key: OpenRouter API密钥
|
|
openrouter_api_key: OpenRouter API密钥
|
|
|
output_dir: 输出目录
|
|
output_dir: 输出目录
|
|
|
|
|
+ top_n: 每个原始特征取评分最高的N个搜索词(默认10)
|
|
|
|
|
+ max_total_searches: 全局最大搜索次数限制(默认None不限制)
|
|
|
|
|
+ search_max_workers: 搜索并发数(默认3)
|
|
|
"""
|
|
"""
|
|
|
self.how_json_path = how_json_path
|
|
self.how_json_path = how_json_path
|
|
|
self.dimension_associations_path = dimension_associations_path
|
|
self.dimension_associations_path = dimension_associations_path
|
|
|
self.optimized_clustered_data_path = optimized_clustered_data_path
|
|
self.optimized_clustered_data_path = optimized_clustered_data_path
|
|
|
self.output_dir = output_dir
|
|
self.output_dir = output_dir
|
|
|
|
|
+ self.top_n = top_n
|
|
|
|
|
+ self.max_total_searches = max_total_searches
|
|
|
|
|
+ self.search_max_workers = search_max_workers
|
|
|
|
|
|
|
|
# 创建输出目录
|
|
# 创建输出目录
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
@@ -572,38 +583,95 @@ class EnhancedSearchV2:
|
|
|
'sub_classifications': sub_classifications
|
|
'sub_classifications': sub_classifications
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- # ========== 阶段3:提取特征列表 ==========
|
|
|
|
|
|
|
+ # ========== 阶段3:筛选高相似度匹配(>0.8) ==========
|
|
|
|
|
|
|
|
- def stage3_extract_features(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
|
|
|
|
|
+ def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
"""
|
|
|
- 阶段3:从关联分类中提取特征列表
|
|
|
|
|
|
|
+ 阶段3:筛选高相似度匹配(>0.8)
|
|
|
|
|
+
|
|
|
|
|
+ 遍历how解构中的所有原始特征,找出匹配结果中相似度>0.8
|
|
|
|
|
+ 且人设特征名称在Stage2关联范围内的高质量匹配
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
associations_data: 阶段2的关联数据
|
|
associations_data: 阶段2的关联数据
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
- 带特征列表的数据
|
|
|
|
|
|
|
+ 带高相似度候选的数据
|
|
|
"""
|
|
"""
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
- logger.info("阶段3:提取特征列表")
|
|
|
|
|
|
|
+ logger.info("阶段3:筛选高相似度匹配(>0.8)")
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
|
|
|
|
|
for idx, feature_result in enumerate(associations_data, 1):
|
|
for idx, feature_result in enumerate(associations_data, 1):
|
|
|
- logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {feature_result['原始特征名称']}")
|
|
|
|
|
|
|
+ original_feature_name = feature_result['原始特征名称']
|
|
|
|
|
+ logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")
|
|
|
|
|
|
|
|
- for assoc in feature_result.get('找到的关联', []):
|
|
|
|
|
- target_path = assoc['目标分类路径']
|
|
|
|
|
- logger.info(f" 提取特征: {target_path}")
|
|
|
|
|
|
|
+ # 步骤1: 收集Stage2的关联范围(分类名+标签)
|
|
|
|
|
+ stage2_scope = self._collect_stage2_scope(feature_result)
|
|
|
|
|
+ logger.info(f" Stage2范围包含 {len(stage2_scope)} 个分类/标签")
|
|
|
|
|
|
|
|
- # 提取特征
|
|
|
|
|
- features = self._find_features_by_path(target_path)
|
|
|
|
|
|
|
+ # 步骤2: 遍历how解构中的所有原始特征,找出高相似度匹配
|
|
|
|
|
+ high_sim_candidates = []
|
|
|
|
|
+ total_checked = 0
|
|
|
|
|
+ high_sim_found = 0
|
|
|
|
|
+
|
|
|
|
|
+ how_result = self.how_data.get('how解构结果', {})
|
|
|
|
|
+ for level_name, level_list in how_result.items():
|
|
|
|
|
+ if not isinstance(level_list, list):
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- # 添加到关联中
|
|
|
|
|
- assoc['特征列表'] = features
|
|
|
|
|
- logger.info(f" 找到 {len(features)} 个特征")
|
|
|
|
|
|
|
+ for item in level_list:
|
|
|
|
|
+ for step in item.get('how步骤列表', []):
|
|
|
|
|
+ for feature in step.get('特征列表', []):
|
|
|
|
|
+ # 获取该特征的所有匹配
|
|
|
|
|
+ matches = feature.get('匹配结果', [])
|
|
|
|
|
+ total_checked += len(matches)
|
|
|
|
|
+
|
|
|
|
|
+ # 筛选相似度>0.8且在Stage2范围内的匹配
|
|
|
|
|
+ for match in matches:
|
|
|
|
|
+ sim = match.get('匹配结果', {}).get('相似度', 0)
|
|
|
|
|
+ persona_feature_name = match.get('人设特征名称', '')
|
|
|
|
|
+
|
|
|
|
|
+ if sim > 0.8 and persona_feature_name in stage2_scope:
|
|
|
|
|
+ high_sim_found += 1
|
|
|
|
|
+ # 记录来源信息
|
|
|
|
|
+ high_sim_candidates.append({
|
|
|
|
|
+ '人设特征名称': persona_feature_name,
|
|
|
|
|
+ '相似度': sim,
|
|
|
|
|
+ '特征类型': match.get('特征类型', ''),
|
|
|
|
|
+ '特征分类': match.get('特征分类', []),
|
|
|
|
|
+ '人设特征层级': match.get('人设特征层级', ''),
|
|
|
|
|
+ '来源路径': self._build_classification_path(match.get('特征分类', [])),
|
|
|
|
|
+ '匹配说明': match.get('匹配结果', {}).get('说明', ''),
|
|
|
|
|
+ '来源原始特征': feature.get('特征名称', '') # 记录来自哪个原始特征
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f" 检查了 {total_checked} 个匹配")
|
|
|
|
|
+ logger.info(f" 找到 {high_sim_found} 个相似度>0.8的匹配")
|
|
|
|
|
+
|
|
|
|
|
+ # 按相似度降序排序,并去重(同一个人设特征名称只保留最高分)
|
|
|
|
|
+ seen_names = set()
|
|
|
|
|
+ unique_candidates = []
|
|
|
|
|
+ high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ for candidate in high_sim_candidates:
|
|
|
|
|
+ name = candidate['人设特征名称']
|
|
|
|
|
+ if name not in seen_names:
|
|
|
|
|
+ seen_names.add(name)
|
|
|
|
|
+ unique_candidates.append(candidate)
|
|
|
|
|
+
|
|
|
|
|
+ # 添加到结果中
|
|
|
|
|
+ feature_result['高相似度候选'] = unique_candidates
|
|
|
|
|
+ logger.info(f" 去重后筛选出 {len(unique_candidates)} 个高相似度候选")
|
|
|
|
|
+
|
|
|
|
|
+ # 显示前5个
|
|
|
|
|
+ if unique_candidates:
|
|
|
|
|
+ logger.info(f" Top 5:")
|
|
|
|
|
+ for c in unique_candidates[:5]:
|
|
|
|
|
+ logger.info(f" • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
|
|
|
|
|
|
|
|
# 保存结果
|
|
# 保存结果
|
|
|
- output_path = os.path.join(self.output_dir, "stage3_features.json")
|
|
|
|
|
|
|
+ output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
|
|
|
self._save_json(associations_data, output_path)
|
|
self._save_json(associations_data, output_path)
|
|
|
|
|
|
|
|
logger.info(f"\n" + "=" * 60)
|
|
logger.info(f"\n" + "=" * 60)
|
|
@@ -612,6 +680,29 @@ class EnhancedSearchV2:
|
|
|
|
|
|
|
|
return associations_data
|
|
return associations_data
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+ def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 收集Stage2找到的所有分类名和标签,形成范围集合
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ feature_result: 特征结果数据
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 包含所有分类名和标签的集合
|
|
|
|
|
+ """
|
|
|
|
|
+ scope = set()
|
|
|
|
|
+
|
|
|
|
|
+ for assoc in feature_result.get('找到的关联', []):
|
|
|
|
|
+ # 添加分类名
|
|
|
|
|
+ scope.add(assoc['分类名称'])
|
|
|
|
|
+
|
|
|
|
|
+ # 添加所有标签
|
|
|
|
|
+ tags = assoc.get('标签列表', [])
|
|
|
|
|
+ scope.update(tags)
|
|
|
|
|
+
|
|
|
|
|
+ return scope
|
|
|
|
|
+
|
|
|
def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
|
|
def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
"""
|
|
|
根据路径查找特征列表
|
|
根据路径查找特征列表
|
|
@@ -630,56 +721,63 @@ class EnhancedSearchV2:
|
|
|
# 深拷贝
|
|
# 深拷贝
|
|
|
return copy.deepcopy(features)
|
|
return copy.deepcopy(features)
|
|
|
|
|
|
|
|
- # ========== 阶段4:生成搜索词 + LLM评估质量 ==========
|
|
|
|
|
|
|
+ # ========== 阶段4:多词组合 + LLM评估 ==========
|
|
|
|
|
|
|
|
def stage4_generate_and_evaluate_search_words(
|
|
def stage4_generate_and_evaluate_search_words(
|
|
|
self,
|
|
self,
|
|
|
- features_data: List[Dict[str, Any]]
|
|
|
|
|
|
|
+ features_data: List[Dict[str, Any]],
|
|
|
|
|
+ max_workers: int = 4,
|
|
|
|
|
+ max_candidates: int = 20,
|
|
|
|
|
+ max_combo_length: int = 4
|
|
|
) -> List[Dict[str, Any]]:
|
|
) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
"""
|
|
|
- 阶段4:生成搜索词并用LLM评估质量
|
|
|
|
|
|
|
+ 阶段4:多词组合 + LLM评估
|
|
|
|
|
+
|
|
|
|
|
+ 基于Stage1的基础词和Stage3的高相似度候选,
|
|
|
|
|
+ 生成所有2-N词组合,通过LLM评估选出Top10
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
- features_data: 阶段3的特征数据
|
|
|
|
|
|
|
+ features_data: 阶段3的数据(包含高相似度候选)
|
|
|
|
|
+ max_workers: 并发评估的原始特征数(默认4)
|
|
|
|
|
+ max_candidates: 参与组合的最大候选词数(默认20)
|
|
|
|
|
+ max_combo_length: 最大组合词数(默认4,即基础词+3个候选)
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
带LLM评估的数据
|
|
带LLM评估的数据
|
|
|
"""
|
|
"""
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
- logger.info("阶段4:生成搜索词 + LLM评估质量")
|
|
|
|
|
|
|
+ logger.info("阶段4:多词组合 + LLM评估")
|
|
|
|
|
+ logger.info(f" 最大候选词数: {max_candidates}")
|
|
|
|
|
+ logger.info(f" 最大组合长度: {max_combo_length} 词")
|
|
|
|
|
+ logger.info(f" 并发数: {max_workers} 个原始特征")
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
|
|
|
|
|
- for idx, feature_result in enumerate(features_data, 1):
|
|
|
|
|
- logger.info(f"\n[{idx}/{len(features_data)}] 处理: {feature_result['原始特征名称']}")
|
|
|
|
|
-
|
|
|
|
|
- # 生成搜索词
|
|
|
|
|
- self._add_search_words(feature_result)
|
|
|
|
|
-
|
|
|
|
|
- # 收集所有搜索词
|
|
|
|
|
- all_search_words = self._collect_all_search_words(feature_result)
|
|
|
|
|
-
|
|
|
|
|
- if not all_search_words:
|
|
|
|
|
- logger.info(f" 无搜索词,跳过")
|
|
|
|
|
- continue
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f" 生成 {len(all_search_words)} 个搜索词")
|
|
|
|
|
-
|
|
|
|
|
- # LLM分批评估(每10个一批)
|
|
|
|
|
- logger.info(f" 开始LLM评估...")
|
|
|
|
|
- original_feature = feature_result['原始特征名称']
|
|
|
|
|
- evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
|
|
|
|
|
- original_feature=original_feature,
|
|
|
|
|
- search_words=[sw['search_word'] for sw in all_search_words],
|
|
|
|
|
- batch_size=10
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ total_features = len(features_data)
|
|
|
|
|
|
|
|
- # 将评估结果写回到特征节点
|
|
|
|
|
- self._write_back_evaluations(feature_result, evaluated)
|
|
|
|
|
|
|
+ # 使用ThreadPoolExecutor并行处理不同的原始特征
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ # 提交所有任务
|
|
|
|
|
+ futures = []
|
|
|
|
|
+ for idx, feature_result in enumerate(features_data, 1):
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self._process_single_feature_combinations,
|
|
|
|
|
+ idx,
|
|
|
|
|
+ total_features,
|
|
|
|
|
+ feature_result,
|
|
|
|
|
+ max_candidates,
|
|
|
|
|
+ max_combo_length
|
|
|
|
|
+ )
|
|
|
|
|
+ futures.append((future, feature_result))
|
|
|
|
|
|
|
|
- logger.info(f" 评估完成,最高分: {evaluated[0]['score']:.3f}")
|
|
|
|
|
|
|
+ # 等待所有任务完成并收集结果
|
|
|
|
|
+ for future, feature_result in futures:
|
|
|
|
|
+ try:
|
|
|
|
|
+ _ = future.result() # 等待完成,结果已经写回到feature_result中
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估失败: {feature_result['原始特征名称']}, 错误: {e}")
|
|
|
|
|
|
|
|
# 保存结果
|
|
# 保存结果
|
|
|
- output_path = os.path.join(self.output_dir, "stage4_with_llm_scores.json")
|
|
|
|
|
|
|
+ output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
|
|
|
self._save_json(features_data, output_path)
|
|
self._save_json(features_data, output_path)
|
|
|
|
|
|
|
|
logger.info(f"\n" + "=" * 60)
|
|
logger.info(f"\n" + "=" * 60)
|
|
@@ -688,95 +786,152 @@ class EnhancedSearchV2:
|
|
|
|
|
|
|
|
return features_data
|
|
return features_data
|
|
|
|
|
|
|
|
- def _add_search_words(self, result: Dict[str, Any]):
|
|
|
|
|
|
|
def _process_single_feature_combinations(
    self,
    idx: int,
    total: int,
    feature_result: Dict[str, Any],
    max_candidates: int,
    max_combo_length: int
) -> None:
    """
    Build and score multi-word search phrases for one original feature.

    Steps:
        1. Read the base word from
           ``feature_result['最高匹配信息']['人设特征名称']``.
        2. Take up to ``max_candidates`` names from
           ``feature_result['高相似度候选']``, skipping empty names, repeats
           and the base word itself.
        3. Form every phrase "base word + k candidates" for
           k = 1 .. ``max_combo_length - 1``.
        4. Score the phrases with
           ``self.llm_evaluator.evaluate_search_words_in_batches``.
        5. Keep the first 10 returned entries.

    The outcome is written in place to ``feature_result['组合评估结果']``;
    the key is always set, and holds ``[]`` whenever a step has nothing to
    work with.

    Args:
        idx: 1-based position of this feature, used only in log output.
        total: Total number of features, used only in log output.
        feature_result: Result dict for one original feature; mutated in place.
        max_candidates: Maximum number of candidate words to combine.
        max_combo_length: Maximum words per phrase, base word included.
    """
    original_feature = feature_result['原始特征名称']
    logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")

    # Step 1: base word. `or {}` tolerates a key that is present but None.
    base_word = (feature_result.get('最高匹配信息') or {}).get('人设特征名称', '')
    if not base_word:
        logger.info(f" 无基础词,跳过")
        feature_result['组合评估结果'] = []
        return

    logger.info(f" 基础词: {base_word}")

    # Step 2: candidate words. The base word is excluded so that no phrase
    # repeats it ("X X"); filtering happens before the cap so the cap is
    # spent on usable words only.
    candidate_words = []
    for candidate in feature_result.get('高相似度候选') or []:
        word = candidate.get('人设特征名称')
        if word and word != base_word and word not in candidate_words:
            candidate_words.append(word)
    candidate_words = candidate_words[:max_candidates]

    if not candidate_words:
        logger.info(f" 无候选词,跳过")
        feature_result['组合评估结果'] = []
        return

    logger.info(f" 候选词数量: {len(candidate_words)} (限制: {max_candidates})")

    # Step 3: the base word occupies one slot, so at most
    # max_combo_length - 1 candidates are appended to it.
    most_candidates = min(max_combo_length - 1, len(candidate_words))
    all_combinations = []
    for length in range(1, most_candidates + 1):
        all_combinations.extend(
            base_word + ' ' + ' '.join(combo)
            for combo in combinations(candidate_words, length)
        )

    logger.info(f" 生成 {len(all_combinations)} 个组合")

    # Nothing to score (e.g. max_combo_length <= 1): skip the LLM call.
    if not all_combinations:
        feature_result['组合评估结果'] = []
        return

    # Step 4: batch evaluation through the LLM evaluator.
    logger.info(f" 开始LLM评估...")
    evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
        original_feature=original_feature,
        search_words=all_combinations,
        batch_size=50
    )

    # Step 5: keep the first 10 entries.
    # NOTE(review): assumes the evaluator returns results sorted best-first,
    # each carrying a 'score' key — confirm against LLMEvaluator.
    top_10 = evaluated[:10]
    feature_result['组合评估结果'] = top_10

    max_score = top_10[0]['score'] if top_10 else 0.0
    logger.info(f" 评估完成,Top 10 最高分: {max_score:.3f}")
|
|
|
|
|
|
|
|
- def _write_back_evaluations(
|
|
|
|
|
|
|
+ # ========== 阶段5:执行搜索 ==========
|
|
|
|
|
+
|
|
|
|
|
def _execute_single_search(
    self,
    idx: int,
    total: int,
    search_word: str,
    feature_ref: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Run one search and record its outcome on ``feature_ref``.

    Calls ``self.search_client.search`` for ``search_word`` and writes the
    keys ``'search_result'`` and ``'search_metadata'`` into ``feature_ref``.
    Any exception raised by the search, or while reading its result, is
    caught, logged and recorded as a failed search instead of propagating.

    Args:
        idx: 1-based position of this search, used only in log output.
        total: Total number of searches, used only in log output.
        search_word: Keyword to search for.
        feature_ref: Dict that receives the search result; mutated in place.

    Returns:
        ``{'status': 'success', 'search_word': ..., 'note_count': ...}`` on
        success, or ``{'status': 'failed', 'search_word': ..., 'error': ...}``
        on failure.
    """
    # One dict feeds both the request and the stored metadata, so the
    # recorded parameters always match what was actually sent.
    search_params = {
        'keyword': search_word,
        'content_type': '不限',
        'sort_type': '综合'
    }

    logger.info(f"[{idx}/{total}] 搜索: {search_word}")

    try:
        result = self.search_client.search(
            **search_params,
            max_retries=3,
            use_cache=True  # ask the client to use its search cache
        )

        note_count = len(result.get('data', {}).get('data', []))
        logger.info(f" ✓ 成功,获取 {note_count} 条帖子")

        # Write the result back onto the referenced item.
        feature_ref['search_result'] = result
        feature_ref['search_metadata'] = {
            'searched_at': datetime.now().isoformat(),
            'status': 'success',
            'note_count': note_count,
            'search_params': search_params
        }

        return {'status': 'success', 'search_word': search_word, 'note_count': note_count}

    except Exception as e:
        logger.error(f" ✗ 失败: {e}")
        feature_ref['search_result'] = None
        feature_ref['search_metadata'] = {
            'searched_at': datetime.now().isoformat(),
            'status': 'failed',
            'note_count': 0,
            'error': str(e)
        }

        return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
|
|
|
|
|
|
|
|
def stage5_execute_searches(
|
|
def stage5_execute_searches(
|
|
|
self,
|
|
self,
|
|
@@ -799,7 +954,7 @@ class EnhancedSearchV2:
|
|
|
logger.info("阶段5:执行小红书搜索")
|
|
logger.info("阶段5:执行小红书搜索")
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
|
|
|
|
|
- # 按原始特征分组收集搜索词
|
|
|
|
|
|
|
+ # 按原始特征分组收集搜索词(从Stage4的组合评估结果读取)
|
|
|
feature_search_groups = {}
|
|
feature_search_groups = {}
|
|
|
|
|
|
|
|
for feature_result in features_data:
|
|
for feature_result in features_data:
|
|
@@ -808,21 +963,19 @@ class EnhancedSearchV2:
|
|
|
if original_feature not in feature_search_groups:
|
|
if original_feature not in feature_search_groups:
|
|
|
feature_search_groups[original_feature] = []
|
|
feature_search_groups[original_feature] = []
|
|
|
|
|
|
|
|
- for assoc in feature_result.get('找到的关联', []):
|
|
|
|
|
- for feature in assoc.get('特征列表', []):
|
|
|
|
|
- sw = feature.get('search_word')
|
|
|
|
|
- if not sw:
|
|
|
|
|
- continue
|
|
|
|
|
|
|
+ # 从Stage4的组合评估结果读取
|
|
|
|
|
+ for eval_item in feature_result.get('组合评估结果', []):
|
|
|
|
|
+ sw = eval_item.get('search_word')
|
|
|
|
|
+ if not sw:
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- # 获取LLM评分
|
|
|
|
|
- llm_eval = feature.get('llm_evaluation', {})
|
|
|
|
|
- score = llm_eval.get('score', 0.0)
|
|
|
|
|
|
|
+ score = eval_item.get('score', 0.0)
|
|
|
|
|
|
|
|
- feature_search_groups[original_feature].append({
|
|
|
|
|
- 'search_word': sw,
|
|
|
|
|
- 'score': score,
|
|
|
|
|
- 'feature_ref': feature
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ feature_search_groups[original_feature].append({
|
|
|
|
|
+ 'search_word': sw,
|
|
|
|
|
+ 'score': score,
|
|
|
|
|
+ 'feature_ref': eval_item # 引用评估项,用于写入搜索结果
|
|
|
|
|
+ })
|
|
|
|
|
|
|
|
# 每组取Top N
|
|
# 每组取Top N
|
|
|
all_searches = []
|
|
all_searches = []
|
|
@@ -844,52 +997,35 @@ class EnhancedSearchV2:
|
|
|
|
|
|
|
|
logger.info(f" {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
|
|
logger.info(f" {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
|
|
|
|
|
|
|
|
- logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
|
|
|
|
|
-
|
|
|
|
|
- # 执行搜索
|
|
|
|
|
- for idx, item in enumerate(all_searches, 1):
|
|
|
|
|
- sw = item['search_word']
|
|
|
|
|
- feature = item['feature_ref']
|
|
|
|
|
|
|
+ # 应用全局搜索次数限制
|
|
|
|
|
+ if self.max_total_searches and len(all_searches) > self.max_total_searches:
|
|
|
|
|
+ logger.info(f" 应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
|
|
|
|
|
+ all_searches = all_searches[:self.max_total_searches]
|
|
|
|
|
|
|
|
- logger.info(f"[{idx}/{len(all_searches)}] 搜索: {sw}")
|
|
|
|
|
|
|
+ logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
|
|
|
|
|
+ logger.info(f" 并发执行搜索(并发数: {self.search_max_workers})")
|
|
|
|
|
|
|
|
- try:
|
|
|
|
|
- result = self.search_client.search(
|
|
|
|
|
- keyword=sw,
|
|
|
|
|
- content_type='图文',
|
|
|
|
|
- sort_type='综合',
|
|
|
|
|
- max_retries=3
|
|
|
|
|
|
|
+ # 使用ThreadPoolExecutor并发执行搜索
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
|
|
|
|
|
+ # 提交所有搜索任务
|
|
|
|
|
+ futures = []
|
|
|
|
|
+ for idx, item in enumerate(all_searches, 1):
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self._execute_single_search,
|
|
|
|
|
+ idx,
|
|
|
|
|
+ len(all_searches),
|
|
|
|
|
+ item['search_word'],
|
|
|
|
|
+ item['feature_ref']
|
|
|
)
|
|
)
|
|
|
|
|
+ futures.append(future)
|
|
|
|
|
|
|
|
- note_count = len(result.get('data', {}).get('data', []))
|
|
|
|
|
- logger.info(f" ✓ 成功,获取 {note_count} 条帖子")
|
|
|
|
|
-
|
|
|
|
|
- # 写入结果
|
|
|
|
|
- feature['search_result'] = result
|
|
|
|
|
- feature['search_metadata'] = {
|
|
|
|
|
- 'searched_at': datetime.now().isoformat(),
|
|
|
|
|
- 'status': 'success',
|
|
|
|
|
- 'note_count': note_count,
|
|
|
|
|
- 'search_params': {
|
|
|
|
|
- 'keyword': sw,
|
|
|
|
|
- 'content_type': '图文',
|
|
|
|
|
- 'sort_type': '综合'
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.error(f" ✗ 失败: {e}")
|
|
|
|
|
- feature['search_result'] = None
|
|
|
|
|
- feature['search_metadata'] = {
|
|
|
|
|
- 'searched_at': datetime.now().isoformat(),
|
|
|
|
|
- 'status': 'failed',
|
|
|
|
|
- 'note_count': 0,
|
|
|
|
|
- 'error': str(e)
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- # 延迟
|
|
|
|
|
- if idx < len(all_searches):
|
|
|
|
|
- time.sleep(search_delay)
|
|
|
|
|
|
|
+ # 等待所有搜索完成
|
|
|
|
|
+ for future in as_completed(futures):
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = future.result()
|
|
|
|
|
+ # 结果已经写入feature_ref,无需额外处理
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 搜索任务失败: {e}")
|
|
|
|
|
|
|
|
# 保存结果
|
|
# 保存结果
|
|
|
output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
|
|
output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
|
|
@@ -974,7 +1110,7 @@ class EnhancedSearchV2:
|
|
|
feature_node: Dict[str, Any]
|
|
feature_node: Dict[str, Any]
|
|
|
) -> Dict[str, Any]:
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
"""
|
|
|
- 评估单个搜索结果
|
|
|
|
|
|
|
+ 评估单个搜索结果(使用并行评估)
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
original_feature: 原始特征
|
|
original_feature: 原始特征
|
|
@@ -986,12 +1122,12 @@ class EnhancedSearchV2:
|
|
|
search_word = feature_node.get('search_word', '')
|
|
search_word = feature_node.get('search_word', '')
|
|
|
notes = feature_node['search_result'].get('data', {}).get('data', [])
|
|
notes = feature_node['search_result'].get('data', {}).get('data', [])
|
|
|
|
|
|
|
|
- return self.llm_evaluator.evaluate_search_results(
|
|
|
|
|
|
|
+ return self.llm_evaluator.evaluate_search_results_parallel(
|
|
|
original_feature=original_feature,
|
|
original_feature=original_feature,
|
|
|
search_word=search_word,
|
|
search_word=search_word,
|
|
|
notes=notes,
|
|
notes=notes,
|
|
|
max_notes=20,
|
|
max_notes=20,
|
|
|
- max_images_per_note=2
|
|
|
|
|
|
|
+ max_workers=20 # 20个并发评估每个帖子
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# ========== 阶段7:扩展搜索 ==========
|
|
# ========== 阶段7:扩展搜索 ==========
|
|
@@ -1052,9 +1188,10 @@ class EnhancedSearchV2:
|
|
|
try:
|
|
try:
|
|
|
result = self.search_client.search(
|
|
result = self.search_client.search(
|
|
|
keyword=extended_kw,
|
|
keyword=extended_kw,
|
|
|
- content_type='图文',
|
|
|
|
|
|
|
+ content_type='不限',
|
|
|
sort_type='综合',
|
|
sort_type='综合',
|
|
|
- max_retries=3
|
|
|
|
|
|
|
+ max_retries=3,
|
|
|
|
|
+ use_cache=True # 启用搜索缓存
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
note_count = len(result.get('data', {}).get('data', []))
|
|
note_count = len(result.get('data', {}).get('data', []))
|
|
@@ -1121,26 +1258,53 @@ class EnhancedSearchV2:
|
|
|
# 阶段2
|
|
# 阶段2
|
|
|
stage2_results = self.stage2_find_associations(stage1_results)
|
|
stage2_results = self.stage2_find_associations(stage1_results)
|
|
|
|
|
|
|
|
- # 阶段3
|
|
|
|
|
- stage3_results = self.stage3_extract_features(stage2_results)
|
|
|
|
|
|
|
+ # 阶段3 - 使用新方法:筛选高相似度匹配
|
|
|
|
|
+ stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
|
|
|
|
|
|
|
|
# 阶段4
|
|
# 阶段4
|
|
|
- stage4_results = self.stage4_generate_and_evaluate_search_words(stage3_results)
|
|
|
|
|
|
|
+ stage4_results = self.stage4_generate_and_evaluate_search_words(
|
|
|
|
|
+ stage3_results,
|
|
|
|
|
+ max_workers=8, # 提高并发从4到8
|
|
|
|
|
+ max_combo_length=3 # 降低组合长度从4到3
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
# 阶段5
|
|
# 阶段5
|
|
|
- stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=10)
|
|
|
|
|
|
|
+ stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)
|
|
|
|
|
|
|
|
- # 阶段6
|
|
|
|
|
- stage6_results = self.stage6_evaluate_search_results(stage5_results)
|
|
|
|
|
|
|
+ # 阶段6 - 暂时切断执行(代码保留)
|
|
|
|
|
+ # stage6_results = self.stage6_evaluate_search_results(stage5_results)
|
|
|
|
|
|
|
|
- # 阶段7
|
|
|
|
|
- final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
|
|
|
|
|
|
|
+ # 阶段7 - 暂时切断执行(代码保留)
|
|
|
|
|
+ # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
|
|
|
|
|
+
|
|
|
|
|
+ logger.info("\n" + "=" * 60)
|
|
|
|
|
+ logger.info("✓ 完整流程执行完成(Stage1-5)")
|
|
|
|
|
+ logger.info("=" * 60)
|
|
|
|
|
|
|
|
|
|
+ # 自动执行可视化
|
|
|
logger.info("\n" + "=" * 60)
|
|
logger.info("\n" + "=" * 60)
|
|
|
- logger.info("✓ 完整流程执行完成")
|
|
|
|
|
|
|
+ logger.info("开始生成可视化...")
|
|
|
logger.info("=" * 60)
|
|
logger.info("=" * 60)
|
|
|
|
|
|
|
|
- return final_results
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = subprocess.run(
|
|
|
|
|
+ ['python3', 'visualize_stage5_results.py'],
|
|
|
|
|
+ capture_output=True,
|
|
|
|
|
+ text=True,
|
|
|
|
|
+ timeout=60
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if result.returncode == 0:
|
|
|
|
|
+ logger.info("✓ 可视化生成成功")
|
|
|
|
|
+ logger.info(result.stdout)
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f"可视化生成失败: {result.stderr}")
|
|
|
|
|
+ except subprocess.TimeoutExpired:
|
|
|
|
|
+ logger.error("可视化生成超时")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f"可视化生成异常: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ return stage5_results
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"流程执行失败: {e}")
|
|
logger.error(f"流程执行失败: {e}")
|
|
@@ -1152,7 +1316,7 @@ def main():
|
|
|
parser = argparse.ArgumentParser(description='增强搜索系统V2')
|
|
parser = argparse.ArgumentParser(description='增强搜索系统V2')
|
|
|
parser.add_argument(
|
|
parser.add_argument(
|
|
|
'--how-json',
|
|
'--how-json',
|
|
|
- default='69114f150000000007001f30_how.json',
|
|
|
|
|
|
|
+ default='69114f150000000007001f30_how copy.json',
|
|
|
help='How解构文件路径'
|
|
help='How解构文件路径'
|
|
|
)
|
|
)
|
|
|
parser.add_argument(
|
|
parser.add_argument(
|
|
@@ -1175,6 +1339,24 @@ def main():
|
|
|
default='output_v2',
|
|
default='output_v2',
|
|
|
help='输出目录'
|
|
help='输出目录'
|
|
|
)
|
|
)
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ '--top-n',
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=10,
|
|
|
|
|
+ help='每个原始特征取评分最高的N个搜索词(默认10)'
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ '--max-total-searches',
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help='全局最大搜索次数限制(默认None不限制)'
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ '--search-workers',
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=3,
|
|
|
|
|
+ help='搜索并发数(默认3)'
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
args = parser.parse_args()
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
@@ -1184,7 +1366,10 @@ def main():
|
|
|
dimension_associations_path=args.dimension_associations,
|
|
dimension_associations_path=args.dimension_associations,
|
|
|
optimized_clustered_data_path=args.optimized_clustered,
|
|
optimized_clustered_data_path=args.optimized_clustered,
|
|
|
openrouter_api_key=args.api_key,
|
|
openrouter_api_key=args.api_key,
|
|
|
- output_dir=args.output_dir
|
|
|
|
|
|
|
+ output_dir=args.output_dir,
|
|
|
|
|
+ top_n=args.top_n,
|
|
|
|
|
+ max_total_searches=args.max_total_searches,
|
|
|
|
|
+ search_max_workers=args.search_workers
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 执行完整流程
|
|
# 执行完整流程
|