@@ -0,0 +1,1291 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Enhanced search system V2
+Full pipeline supporting LLM evaluation and extended search
+"""
+
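+# Example invocation (flags defined in main() below; the input path is illustrative):
+#   python3 enhanced_search_v2.py --how-json input/posts/<post_id>_how.json \
+#       --output-dir output_v2 --enable-stage5 --max-total-searches 50
+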
+import json
+import logging
+import os
+import argparse
+import subprocess
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from src.clients.openrouter_client import OpenRouterClient
+from src.evaluators.llm_evaluator import LLMEvaluator
+from src.clients.xiaohongshu_search import XiaohongshuSearch
+from src.analyzers.post_deconstruction_analyzer import PostDeconstructionAnalyzer
+from src.analyzers.similarity_analyzer import SimilarityAnalyzer
+
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('enhanced_search_v2.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class EnhancedSearchV2:
+    """Enhanced search system V2"""
+
+    def __init__(
+        self,
+        how_json_path: str,
+        openrouter_api_key: Optional[str] = None,
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3,
+        max_searches_per_feature: Optional[int] = None,
+        max_searches_per_base_word: Optional[int] = None,
+        enable_evaluation: bool = True,
+        evaluation_max_workers: int = 10,
+        evaluation_max_notes_per_query: int = 20,
+        enable_deep_analysis: bool = False,
+        deep_analysis_only: bool = False,
+        deep_analysis_max_workers: int = 5,
+        deep_analysis_max_notes: Optional[int] = None,
+        deep_analysis_skip_count: int = 0,
+        deep_analysis_sort_by: str = 'score',
+        deep_analysis_api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        deep_analysis_min_score: float = 0.8,
+        enable_similarity_analysis: bool = False,
+        similarity_weight_embedding: float = 0.5,
+        similarity_weight_semantic: float = 0.5,
+        similarity_max_workers: int = 5,
+        similarity_min_similarity: float = 0.0
+    ):
+        """
+        Initialize the system
+
+        Args:
+            how_json_path: Path to the "how" deconstruction file
+            openrouter_api_key: OpenRouter API key
+            output_dir: Output directory
+            top_n: Keep the N highest-scoring search words per original feature (default 10)
+            max_total_searches: Global cap on total searches (default None, unlimited)
+            search_max_workers: Search concurrency (default 3)
+            max_searches_per_feature: Max searches per original feature (default None, unlimited)
+            max_searches_per_base_word: Max searches per base_word (default None, unlimited)
+            enable_evaluation: Whether to evaluate search results (default True)
+            evaluation_max_workers: Concurrency for result evaluation (default 10)
+            evaluation_max_notes_per_query: Max notes evaluated per search result (default 20)
+            enable_deep_analysis: Whether to run deep deconstruction (default False)
+            deep_analysis_only: Run deep deconstruction only, starting from evaluation results (default False)
+            deep_analysis_max_workers: Deep deconstruction concurrency (default 5)
+            deep_analysis_max_notes: Max posts to deep-deconstruct (default None, unlimited)
+            deep_analysis_skip_count: Skip the first N posts in deep deconstruction (default 0)
+            deep_analysis_sort_by: Deep deconstruction ordering: score/time/engagement (default score)
+            deep_analysis_api_url: Deep deconstruction API endpoint
+            deep_analysis_min_score: Minimum score for deep deconstruction (default 0.8, on a 0-1 scale)
+            enable_similarity_analysis: Whether to run similarity analysis (default False)
+            similarity_weight_embedding: Embedding-model weight for similarity analysis (default 0.5)
+            similarity_weight_semantic: LLM-model weight for similarity analysis (default 0.5)
+            similarity_max_workers: Similarity analysis concurrency (default 5)
+            similarity_min_similarity: Minimum similarity threshold (default 0.0)
+        """
+        self.how_json_path = how_json_path
+        self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers
+        self.max_searches_per_feature = max_searches_per_feature
+        self.max_searches_per_base_word = max_searches_per_base_word
+        self.enable_evaluation = enable_evaluation
+        self.evaluation_max_workers = evaluation_max_workers
+        self.evaluation_max_notes_per_query = evaluation_max_notes_per_query
+        self.enable_deep_analysis = enable_deep_analysis
+        self.deep_analysis_only = deep_analysis_only
+        self.enable_similarity_analysis = enable_similarity_analysis
+
+        # Create the output directory
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Load data
+        logger.info("Loading data files...")
+        self.how_data = self._load_json(how_json_path)
+        logger.info("  ✓ Loaded how.json")
+
+        # Initialize components
+        logger.info("Initializing components...")
+        self.openrouter_client = OpenRouterClient(
+            api_key=openrouter_api_key,
+            model="google/gemini-2.5-flash",
+            retry_delay=5  # longer retry delay to avoid rate limiting
+        )
+        self.llm_evaluator = LLMEvaluator(self.openrouter_client)
+        self.search_client = XiaohongshuSearch()
+
+        # Initialize the deep deconstruction analyzer
+        self.deep_analyzer = PostDeconstructionAnalyzer(
+            api_url=deep_analysis_api_url,
+            max_workers=deep_analysis_max_workers,
+            max_notes=deep_analysis_max_notes,
+            min_score=deep_analysis_min_score,
+            skip_count=deep_analysis_skip_count,
+            sort_by=deep_analysis_sort_by,
+            output_dir=output_dir,
+            enable_image_download=False,  # use original image URLs directly, no proxying
+            image_server_url="http://localhost:8765",  # image server URL (deprecated)
+            image_download_dir="downloaded_images"  # image download directory (deprecated)
+        )
+
+        # Initialize the similarity analyzer
+        self.similarity_analyzer = SimilarityAnalyzer(
+            weight_embedding=similarity_weight_embedding,
+            weight_semantic=similarity_weight_semantic,
+            max_workers=similarity_max_workers,
+            min_similarity=similarity_min_similarity,
+            evaluation_results_path=os.path.join(output_dir, "evaluated_results.json"),
+            update_evaluation_scores=True  # automatically compute the combined score P
+        )
+
+        logger.info("System initialization complete")
+
+    def _load_json(self, file_path: str) -> Any:
+        """Load a JSON file"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except Exception as e:
+            logger.error(f"Failed to load file {file_path}: {e}")
+            raise
+
+    def _save_json(self, data: Any, file_path: str):
+        """Save data to a JSON file"""
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+            logger.info(f"Saved: {file_path}")
+        except Exception as e:
+            logger.error(f"Failed to save file {file_path}: {e}")
+            raise
+
+    # ========== Step 1: select features with 0.5 <= similarity < 0.8 ==========
+
+    def filter_medium_similarity_features(self) -> List[Dict[str, Any]]:
+        """
+        Step 1: select medium-match features
+
+        Selection condition: 0.5 <= max similarity < 0.8
+
+        Returns:
+            The list of selected features
+        """
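+        # Illustrative examples of the band: a max similarity of 0.45 is dropped
+        # (< 0.5), 0.65 is kept, 0.85 is dropped (>= 0.8).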
+        logger.info("=" * 60)
+        logger.info("Step 1: selecting medium-match features (0.5 <= similarity < 0.8)")
+        logger.info("=" * 60)
+
+        results = []
+        how_result = self.how_data.get('解构结果', {})
+
+        total_features = 0
+        filtered_out_low = 0   # < 0.5
+        filtered_out_high = 0  # >= 0.8
+        selected_count = 0
+
+        # Iterate over the three dimensions
+        for level_name, level_list in how_result.items():
+            if not isinstance(level_list, list):
+                continue
+
+            logger.info(f"\nProcessing {level_name}...")
+
+            for item_idx, item in enumerate(level_list):
+                item_name = item.get('名称', f'未命名-{item_idx}')
+
+                # New format: persona match results live directly on the point
+                match_results = item.get('匹配人设结果', [])
+
+                total_features += 1
+
+                if not match_results:
+                    logger.info(f"  ✗ {item_name}: no match results")
+                    continue
+
+                # Find the highest similarity (new format: similarity is a direct field)
+                max_similarity = max(
+                    (m.get('相似度', 0) for m in match_results),
+                    default=0
+                )
+
+                # Selection condition
+                if max_similarity < 0.5:
+                    filtered_out_low += 1
+                    logger.info(f"  ✗ {item_name}: max similarity {max_similarity:.3f} < 0.5 (filtered)")
+                    continue
+                elif max_similarity >= 0.8:
+                    filtered_out_high += 1
+                    logger.info(f"  ✗ {item_name}: max similarity {max_similarity:.3f} >= 0.8 (filtered)")
+                    continue
+
+                # 0.5 <= max_similarity < 0.8: keep it
+                # Sort by similarity in descending order and take the top 3
+                sorted_matches = sorted(
+                    match_results,
+                    key=lambda x: x.get('相似度', 0),
+                    reverse=True
+                )
+                top3_matches = sorted_matches[:3]  # take the top 3
+
+                # Build the top-3 match info list
+                top3_match_info = []
+                for match in top3_matches:
+                    feature_classification = match.get('特征分类', [])
+                    classification_path = self._build_classification_path(feature_classification)
+
+                    # Read the feature type directly from the match result
+                    is_classification = (match.get('特征类型') == '分类')
+
+                    top3_match_info.append({
+                        '人设特征名称': match.get('人设特征名称'),
+                        '人设特征层级': match.get('人设特征层级'),
+                        '特征类型': match.get('特征类型'),
+                        '特征分类': feature_classification,
+                        '相似度': match.get('相似度', 0),  # direct field
+                        '匹配说明': match.get('说明', ''),  # direct field
+                        '是分类': is_classification,
+                        '所属分类路径': classification_path
+                    })
+
+                result_item = {
+                    '原始特征名称': item_name,  # use the point name as the feature name
+                    '来源层级': level_name,
+                    '权重': 1.0,  # the new format has no weight field; default to 1.0
+                    '所属点名称': item_name,
+                    '最高匹配信息': top3_match_info[0],  # keep the first one for step 2
+                    'top3匹配信息': top3_match_info  # new field
+                }
+
+                results.append(result_item)
+                selected_count += 1
+
+                # Show the top-3 match info
+                top3_names = [m['人设特征名称'] for m in top3_match_info]
+                logger.info(f"  ✓ {item_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")
+
+        # Statistics
+        logger.info("\n" + "=" * 60)
+        logger.info("Step 1 complete")
+        logger.info(f"  Total features: {total_features}")
+        logger.info(f"  Filtered out (<0.5): {filtered_out_low}")
+        logger.info(f"  Filtered out (>=0.8): {filtered_out_high}")
+        logger.info(f"  Kept (0.5-0.8): {selected_count}")
+        logger.info("=" * 60)
+
+        # Save the results
+        output_path = os.path.join(self.output_dir, "filtered_features.json")
+        self._save_json(results, output_path)
+
+        return results
+
+    def _build_classification_path(self, feature_classification: List[str]) -> str:
+        """
+        Build the classification path
+
+        Args:
+            feature_classification: Feature classification array
+
+        Returns:
+            The classification path
+        """
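+        # Worked example (hypothetical values): ["美食探店", "兴趣实质", "生活方式"]
+        # -> the middle "兴趣实质" loses its "实质" suffix -> reversed and joined:
+        # "生活方式/兴趣/美食探店"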
+        if not feature_classification:
+            return ""
+
+        # Step 1: strip the "实质" suffix from middle elements
+        cleaned = []
+        for i, item in enumerate(feature_classification):
+            if i == len(feature_classification) - 1:  # keep the last element as-is
+                cleaned.append(item)
+            elif item.endswith("实质") and i != 0:  # strip "实质" from middle elements
+                cleaned.append(item[:-2])
+            else:
+                cleaned.append(item)
+
+        # Step 2: reverse the array
+        reversed_list = list(reversed(cleaned))
+
+        # Step 3: join into a path
+        path = "/".join(reversed_list)
+
+        return path
+    # ========== Step 2: extract high-similarity candidate words from the how file ==========
+
+    def extract_candidate_words(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Step 2: extract candidate words from the how file
+
+        Processing flow:
+        1. Extract persona candidates: persona feature names with similarity >= 0.8
+        2. Extract post candidates: point names (inspiration/purpose/key points) whose best persona similarity is >= 0.8
+        3. Merge and deduplicate the two candidate sets
+        4. Sort by similarity in descending order
+        5. Assign the candidate list to each base word
+        6. Build the '高相似度候选_按base_word' structure
+
+        Args:
+            filtered_features: Feature list selected in step 1
+
+        Returns:
+            Feature list with high-similarity candidates attached
+        """
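+        # Shape of the attached candidates (values illustrative):
+        # '高相似度候选_按base_word': {<base_word>: [{'候选词': ..., '候选词类型': 'persona' or 'post',
+        #                                            '相似度': 0.83, ...}, ...]}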
+        logger.info("=" * 60)
+        logger.info("Step 2: extracting candidate words from the how file (persona + post)")
+        logger.info("=" * 60)
+
+        how_result = self.how_data.get('解构结果', {})
+
+        # Step 1: extract persona candidates (similarity >= 0.8)
+        persona_candidates_dict = {}  # {persona feature name: {candidate info}}
+
+        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
+            items_list = how_result.get(dimension, [])
+
+            for item in items_list:
+                item_name = item.get('名称', '')
+                matches = item.get('匹配人设结果', [])
+
+                for match in matches:
+                    similarity = match.get('相似度', 0)
+                    persona_feature_name = match.get('人设特征名称', '')
+
+                    # Keep only similarity >= 0.8
+                    if similarity >= 0.8 and persona_feature_name:
+                        # Deduplication: keep the highest similarity
+                        if persona_feature_name not in persona_candidates_dict or \
+                           similarity > persona_candidates_dict[persona_feature_name]['相似度']:
+                            persona_candidates_dict[persona_feature_name] = {
+                                '候选词': persona_feature_name,
+                                '候选词类型': 'persona',  # mark as a persona candidate
+                                '相似度': similarity,
+                                '特征类型': match.get('特征类型', ''),
+                                '特征分类': match.get('特征分类', []),
+                                '人设特征层级': match.get('人设特征层级', ''),
+                                '来源层级': 'persona',
+                                '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                '匹配说明': match.get('说明', ''),
+                                '来源原始特征': item_name
+                            }
+
+        # Step 2: extract post candidates (point names whose best persona similarity is >= 0.8)
+        post_candidates_dict = {}  # {point name: {candidate info}}
+
+        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
+            items_list = how_result.get(dimension, [])
+
+            for item in items_list:
+                item_name = item.get('名称', '')
+                matches = item.get('匹配人设结果', [])
+
+                if not item_name or not matches:
+                    continue
+
+                # Compute the point's highest similarity to the persona
+                max_similarity = max(
+                    (m.get('相似度', 0) for m in matches),
+                    default=0
+                )
+
+                # Only points with max similarity >= 0.8 become post candidates
+                if max_similarity >= 0.8:
+                    # If the point name already exists as a persona candidate, skip it
+                    # (persona candidates take precedence)
+                    if item_name not in persona_candidates_dict and item_name not in post_candidates_dict:
+                        post_candidates_dict[item_name] = {
+                            '候选词': item_name,
+                            '候选词类型': 'post',  # mark as a post candidate
+                            '相似度': 1.0,  # the post's own point; similarity treated as 1.0
+                            '特征类型': item.get('类型', ''),
+                            '特征分类': [],
+                            '人设特征层级': '',
+                            '来源层级': dimension,
+                            '来源路径': f"帖子/{dimension}/{item_name}",
+                            '匹配说明': item.get('描述', ''),
+                            '来源原始特征': item_name,
+                            '点最高人设相似度': max_similarity  # the point's best persona similarity
+                        }
+
+        # Step 3: merge the two candidate sets
+        all_candidates_dict = {}
+        all_candidates_dict.update(persona_candidates_dict)  # persona candidates
+        all_candidates_dict.update(post_candidates_dict)  # post candidates
+
+        # Step 4: convert to a list sorted by similarity in descending order
+        global_candidates = sorted(
+            all_candidates_dict.values(),
+            key=lambda x: x['相似度'],
+            reverse=True
+        )
+
+        logger.info("Candidate statistics:")
+        logger.info(f"  - persona candidates: {len(persona_candidates_dict)}")
+        logger.info(f"  - post candidates: {len(post_candidates_dict)}")
+        logger.info(f"  - total candidates: {len(global_candidates)}")
+
+        # Show the top 10 candidates
+        if global_candidates:
+            logger.info("\nTop 10 candidates:")
+            for i, candidate in enumerate(global_candidates[:10], 1):
+                cand_type = "persona" if candidate['候选词类型'] == 'persona' else "post"
+                logger.info(f"  {i}. {candidate['候选词']} (similarity: {candidate['相似度']:.3f}, type: {cand_type})")
+
+        # Step 5: build the output structure for each feature
+        results = []
+        for idx, feature_data in enumerate(filtered_features, 1):
+            original_feature_name = feature_data.get('原始特征名称', '')
+            logger.info(f"\n[{idx}/{len(filtered_features)}] Processing: {original_feature_name}")
+
+            top3_matches = feature_data.get('top3匹配信息', [])
+
+            # Extract the 3 base words
+            base_words = [match.get('人设特征名称', '') for match in top3_matches[:3]]
+            logger.info(f"  Base words: {', '.join(base_words)}")
+
+            # All base words share the same candidate list
+            high_similarity_by_base = {}
+            for base_word in base_words:
+                if base_word:
+                    high_similarity_by_base[base_word] = global_candidates.copy()
+
+            logger.info(f"  Each base word gets {len(global_candidates)} candidates")
+
+            result = {
+                '原始特征名称': original_feature_name,
+                '来源层级': feature_data.get('来源层级', ''),  # keep metadata
+                '权重': feature_data.get('权重', 0),  # keep metadata
+                'top3匹配信息': top3_matches,
+                '找到的关联_按base_word': {},  # the new approach needs no association analysis
+                '高相似度候选_按base_word': high_similarity_by_base
+            }
+            results.append(result)
+
+        # Save the results
+        output_path = os.path.join(self.output_dir, 'candidate_words.json')
+        self._save_json(results, output_path)
+
+        logger.info("\n" + "=" * 60)
+        logger.info("Step 2 complete")
+        logger.info(f"  Candidates extracted: {len(global_candidates)}")
+        logger.info(f"  Features processed: {len(results)}")
+        logger.info("=" * 60)
+
+        return results
+
+    # ========== Step 3: multi-word combinations + LLM evaluation ==========
+
+    def generate_search_queries(
+        self,
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
+    ) -> List[Dict[str, Any]]:
+        """
+        Step 3: multi-word combinations + LLM evaluation
+
+        Based on the base words from step 1 and the high-similarity candidates
+        from step 2, let the LLM generate queries and keep the top 10 per base word.
+
+        Args:
+            features_data: Output of step 2 (includes high-similarity candidates)
+            max_workers: Number of original features evaluated concurrently (default 4)
+            max_candidates: Max candidate words used per combination (default 20)
+            max_combo_length: Max words per combination (default 4; currently unused
+                by the LLM-generation path, kept for compatibility)
+
+        Returns:
+            Data with LLM-evaluated queries attached
+        """
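+        # Each item in 'top10_searches' is later read as a dict with at least
+        # 'search_word' and 'score' (see execute_search_queries below).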
+        logger.info("=" * 60)
+        logger.info("Step 3: multi-word combinations + LLM evaluation")
+        logger.info(f"  Max candidates: {max_candidates}")
+        logger.info(f"  Max combination length: {max_combo_length} words")
+        logger.info(f"  Concurrency: {max_workers} original features")
+        logger.info("=" * 60)
+
+        total_features = len(features_data)
+
+        # Process different original features in parallel with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))
+
+            # Wait for all tasks and collect the results
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # wait; results were written back into feature_result
+                except Exception as e:
+                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")
+
+        # Save the results
+        output_path = os.path.join(self.output_dir, "search_queries_evaluated.json")
+        self._save_json(features_data, output_path)
+
+        logger.info("\n" + "=" * 60)
+        logger.info("Step 3 complete")
+        logger.info("=" * 60)
+
+        return features_data
+
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
+        """
+        Generate and evaluate query combinations for one original feature
+
+        Improvement: each base_word uses its own candidates (instead of sharing them)
+
+        Steps:
+        1. Get the top-3 base_words from step 1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from step 2's 高相似度候选_按base_word
+           b. Generate combinations
+           c. LLM evaluation
+           d. Select the top 10
+        3. Save the grouped results
+
+        Args:
+            idx: Feature index
+            total: Total number of features
+            feature_result: Feature result data
+            max_candidates: Max candidate words used per combination
+            max_combo_length: Max words per combination (unused in the LLM-generation path)
+        """
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] Processing: {original_feature}")
+
+        # Step 1: get the top-3 base words
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info("  No top-3 match info; skipping")
+            feature_result['组合评估结果_分组'] = []
+            return
+
+        logger.info(f"  Found {len(top3_info)} base_words")
+
+        # Step 2: get the candidates grouped by base_word
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
+
+        if not candidates_by_base_word:
+            logger.warning("  No candidates grouped by base_word; skipping")
+            feature_result['组合评估结果_分组'] = []
+            return
+
+        # Step 3: process each base_word independently
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word} (similarity: {base_similarity:.3f})")
+
+            # Get this base_word's candidates
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['候选词'] for c in candidates]
+
+            if not candidate_words:
+                logger.warning("    No candidates for this base_word; skipping")
+                grouped_results.append({
+                    'base_word': base_word,
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
+                })
+                continue
+
+            logger.info(f"    Candidate count: {len(candidate_words)} (limit: {max_candidates})")
+
+            # Generate queries with the LLM (new approach: the LLM builds queries
+            # directly from the candidates)
+            logger.info(f"    Generating queries with the LLM (base word: {base_word})...")
+            evaluated = self.llm_evaluator.generate_queries_from_candidates(
+                original_feature=original_feature,
+                base_word=base_word,
+                candidate_words=candidate_words,
+                max_queries=10
+            )
+
+            # Keep the top 10 (the LLM generation method already caps the count)
+            top_10 = evaluated[:10]
+            logger.info(f"    Generation done: {len(top_10)} queries")
+
+            # Save the grouped result - each base_word has its own available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # this base_word's own candidates
+            })
+
+        # Write the results back
+        feature_result['组合评估结果_分组'] = grouped_results
+
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  Done! {len(grouped_results)} base_words, {total_searches} search words")
+
+    # ========== Step 4: execute searches ==========
+
+    def _execute_single_search(
+        self,
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Execute a single search task (for concurrent execution)
+
+        Args:
+            idx: Search index
+            total: Total number of searches
+            search_word: Search word
+            feature_ref: Feature reference (the result is written into it)
+
+        Returns:
+            Search result info
+        """
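+        # Each task writes only into its own feature_ref dict, so results do not
+        # race across workers; this assumes self.search_client.search() is safe
+        # to call concurrently.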
+        logger.info(f"[{idx}/{total}] Searching: {search_word}")
+
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # enable the search cache
+            )
+
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ Success, got {note_count} posts")
+
+            # Write the result
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '不限',
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ Failed: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
+
+    def execute_search_queries(
+        self,
+        features_data: List[Dict[str, Any]],
+        search_delay: float = 2.0,
+        top_n: int = 10
+    ) -> List[Dict[str, Any]]:
+        """
+        Step 4: execute the Xiaohongshu searches
+
+        Args:
+            features_data: Output of step 3
+            search_delay: Delay between searches (currently unused)
+            top_n: Keep the N highest-scoring search words per original feature
+                (currently unused; the per-base_word/per-feature/global limits apply instead)
+
+        Returns:
+            Data with search results attached
+        """
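+        # How the limits cascade (illustrative): with 3 base_words x 10 queries,
+        # max_searches_per_base_word=5 leaves 15 tasks, max_searches_per_feature=8
+        # trims them to 8, and max_total_searches caps the final task list.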
+        logger.info("=" * 60)
+        logger.info("Step 4: executing Xiaohongshu searches")
+        logger.info("=" * 60)
+
+        # Collect search words grouped by original feature (read from the query-generation results)
+        feature_search_groups = {}
+
+        for feature_result in features_data:
+            original_feature = feature_result['原始特征名称']
+
+            if original_feature not in feature_search_groups:
+                feature_search_groups[original_feature] = []
+
+            # Read from the query-generation results (new structure)
+            grouped_results = feature_result.get('组合评估结果_分组', [])
+
+            if grouped_results:
+                # Grouped structure: run every base_word's top 10
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)
+
+                    base_word_searches = []
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue
+
+                        score = eval_item.get('score', 0.0)
+
+                        base_word_searches.append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # reference to the item; search results are written into it
+                        })
+
+                    # Apply the per-base_word search limit
+                    if self.max_searches_per_base_word and len(base_word_searches) > self.max_searches_per_base_word:
+                        logger.info(f"  Applying base_word limit: {base_word} reduced from {len(base_word_searches)} to {self.max_searches_per_base_word}")
+                        base_word_searches = base_word_searches[:self.max_searches_per_base_word]
+
+                    feature_search_groups[original_feature].extend(base_word_searches)
+            else:
+                # Backward compatibility with the old structure (组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue
+
+                    score = eval_item.get('score', 0.0)
+
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
+
+            # Apply the per-feature search limit
+            if self.max_searches_per_feature and len(feature_search_groups[original_feature]) > self.max_searches_per_feature:
+                logger.info(f"  Applying feature limit: {original_feature} reduced from {len(feature_search_groups[original_feature])} to {self.max_searches_per_feature}")
+                feature_search_groups[original_feature] = feature_search_groups[original_feature][:self.max_searches_per_feature]
+
+        # Collect all search tasks (the grouped structure runs every base_word's top 10; no further filtering)
+        all_searches = []
+        total_count = 0
+
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)
+
+            logger.info(f"  {original_feature}: {len(search_list)} search words")
+
+        # Apply the global search limit
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  Applying global limit: reduced from {len(all_searches)} to {self.max_total_searches}")
+            all_searches = all_searches[:self.max_total_searches]
+
+        logger.info(f"\n{len(all_searches)} search tasks in total")
+        logger.info(f"  Running searches concurrently (workers: {self.search_max_workers})")
+
+        # Run the searches concurrently with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # Submit all search tasks
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
+                )
+                futures.append(future)
+
+            # Wait for all searches to finish
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                    # The result was already written into feature_ref; nothing else to do
+                except Exception as e:
+                    logger.error(f"  Search task failed: {e}")
+
+        # Save the results
+        output_path = os.path.join(self.output_dir, "search_results.json")
+        self._save_json(features_data, output_path)
+
+        logger.info("\n" + "=" * 60)
+        logger.info("Step 4 complete")
+        logger.info("=" * 60)
+
+        return features_data
+
+    # ========== Step 5: LLM evaluation of search results (two-layer filtered evaluation) ==========
+    # Note: the old single-layer evaluation methods were moved to backup/unused_methods_from_enhanced_search_v2.py
+
+    def evaluate_search_results(
+        self,
+        features_data: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Step 5: evaluate search results with the LLM (two-layer filtered evaluation)
+
+        Walk all search results and apply the two-layer evaluation:
+        1. Layer 1: filter out results unrelated to the search query
+        2. Layer 2: score the match against the target feature (0.8-1.0 / 0.6-0.79 / 0.5-0.59 / <=0.4)
+
+        Args:
+            features_data: Output of step 4
+
+        Returns:
+            Data with evaluation results attached
+        """
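+        # Each evaluated search item gains an 'evaluation_with_filter' key; the
+        # whole structure is then saved to evaluated_results.json for later steps.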
+        logger.info("=" * 60)
+        logger.info("Step 5: LLM evaluation of search results (two-layer filtered evaluation)")
+        logger.info(f"  Concurrency: {self.evaluation_max_workers}")
+        logger.info(f"  Max notes evaluated per search: {self.evaluation_max_notes_per_query}")
+        logger.info("=" * 60)
+
+        # Collect all search items that need evaluation
+        search_items_to_evaluate = []
+
+        for feature_result in features_data:
+            original_feature = feature_result['原始特征名称']
+
+            # Read the search results from 组合评估结果_分组
+            grouped_results = feature_result.get('组合评估结果_分组', [])
+
+            if grouped_results:
+                for group in grouped_results:
+                    for eval_item in group.get('top10_searches', []):
+                        # Check whether there is a search result
+                        if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
+                            search_items_to_evaluate.append({
+                                'original_feature': original_feature,
+                                'search_item': eval_item,
+                                'base_word': group.get('base_word', '')
+                            })
+            else:
+                # Backward compatibility with the old structure
+                for eval_item in feature_result.get('组合评估结果', []):
+                    if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
+                        search_items_to_evaluate.append({
+                            'original_feature': original_feature,
+                            'search_item': eval_item,
+                            'base_word': ''
+                        })
+
+        logger.info(f"{len(search_items_to_evaluate)} search results to evaluate")
+
+        # Evaluate all search results in parallel
+        with ThreadPoolExecutor(max_workers=self.evaluation_max_workers) as executor:
+            futures = []
+            for idx, item in enumerate(search_items_to_evaluate, 1):
+                future = executor.submit(
+                    self._evaluate_single_search_with_filter,
+                    idx,
+                    len(search_items_to_evaluate),
+                    item['original_feature'],
+                    item['search_item'],
+                    item['base_word']
+                )
+                futures.append((future, item))
+
+            # Collect the results
+            success_count = 0
+            failed_count = 0
+
+            for future, item in futures:
+                try:
+                    evaluation = future.result()
+                    item['search_item']['evaluation_with_filter'] = evaluation
+                    success_count += 1
+                except Exception as e:
+                    logger.error(f"  Evaluation failed: {item['search_item'].get('search_word', 'unknown')}, error: {e}")
+                    item['search_item']['evaluation_with_filter'] = None
+                    failed_count += 1
+
+        logger.info(f"\nEvaluation done: {success_count} succeeded, {failed_count} failed")
+
+        # Save the results
+        output_path = os.path.join(self.output_dir, "evaluated_results.json")
+        self._save_json(features_data, output_path)
+
+        logger.info("\n" + "=" * 60)
+        logger.info("Step 5 complete")
+        logger.info("=" * 60)
+
+        return features_data
+
+    def _evaluate_single_search_with_filter(
+        self,
+        idx: int,
+        total: int,
+        original_feature: str,
+        search_item: Dict[str, Any],
+        base_word: str
+    ) -> Dict[str, Any]:
+        """
+        Evaluate a single search result (with the two-layer filter)
+
+        Args:
+            idx: Index
+            total: Total count
+            original_feature: Original feature
+            search_item: Search item (contains search_word and search_result)
+            base_word: Base word
+
+        Returns:
+            Evaluation result
+        """
+        search_word = search_item.get('search_word', '')
+        notes = search_item['search_result'].get('data', {}).get('data', [])
+
+        logger.info(f"[{idx}/{total}] Evaluating: {search_word} (posts: {len(notes)})")
+
+        # Call the LLM evaluator's batch evaluation
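+        # The evaluator is expected to return a dict containing 'filtered_count',
+        # 'evaluated_count' and 'match_distribution' (read below).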
+        evaluation = self.llm_evaluator.batch_evaluate_notes_with_filter(
+            search_query=search_word,
+            target_feature=original_feature,
+            notes=notes,
+            max_notes=self.evaluation_max_notes_per_query,
+            max_workers=self.evaluation_max_workers
+        )
+
+        # Statistics
+        filtered_count = evaluation.get('filtered_count', 0)
+        evaluated_count = evaluation.get('evaluated_count', 0)
+        match_dist = evaluation.get('match_distribution', {})
+
+        logger.info(f"  ✓ Done: filtered {filtered_count}, evaluated {evaluated_count}, "
+                    f"exact matches {match_dist.get('完全匹配(0.8-1.0)', 0)}, "
+                    f"similar matches {match_dist.get('相似匹配(0.6-0.79)', 0)}")
+
+        return evaluation
+
+    # ========== Main pipeline ==========
+    # Note: the old extended-search methods (extended_searches) were moved to backup/unused_methods_from_enhanced_search_v2.py
+
+    def run_full_pipeline(self):
+        """Run the full pipeline"""
+        logger.info("\n" + "=" * 60)
+        logger.info("Starting the full pipeline")
+        logger.info("=" * 60)
+
+        try:
+            # Deep-deconstruction-only mode: run only the deep analysis (starting from the evaluation results)
+            if self.deep_analysis_only:
+                logger.info("Run mode: deep deconstruction only (starting from the evaluation results)")
+                evaluation_path = os.path.join(self.output_dir, "evaluated_results.json")
+
+                if not os.path.exists(evaluation_path):
+                    raise FileNotFoundError(f"Evaluation results not found: {evaluation_path}")
+
+                with open(evaluation_path, 'r', encoding='utf-8') as f:
+                    evaluation_results = json.load(f)
+
+                deep_results = self.deep_analyzer.run(evaluation_results)
+                return deep_results
+
+            # Normal pipeline: start from feature selection
+            # Step 1
+            filtered_features = self.filter_medium_similarity_features()
+
+            # Step 2: extract candidate words from the how file
+            candidates = self.extract_candidate_words(filtered_features)
+
+            # Step 3: multi-word combinations + LLM evaluation
+            queries = self.generate_search_queries(
+                candidates,
+                max_workers=8,  # raise concurrency from 4 to 8
+                max_combo_length=3  # lower the combination length from 4 to 3
+            )
+
+            # Step 4: execute the searches
+            search_results = self.execute_search_queries(queries, search_delay=2.0, top_n=self.top_n)
+
+            # Step 5: LLM evaluation of search results - conditional
+            if self.enable_evaluation:
+                evaluation_results = self.evaluate_search_results(search_results)
+            else:
+                evaluation_results = search_results
+                logger.info("\n" + "=" * 60)
+                logger.info("Step 5: skipped (not enabled)")
+                logger.info("=" * 60)
+
+            # Deep deconstruction analysis - conditional
+            if self.enable_deep_analysis:
+                deep_results = self.deep_analyzer.run(evaluation_results)
+                final_results = deep_results
+            else:
+                final_results = evaluation_results
+
+            # Similarity analysis - conditional
+            if self.enable_similarity_analysis and self.enable_deep_analysis:
+                logger.info("\n" + "=" * 60)
+                logger.info("Step 7: similarity analysis (deconstructed features vs. original features)")
+                logger.info("=" * 60)
+
+                similarity_results = self.similarity_analyzer.run(
+                    deep_results,
+                    output_path=os.path.join(self.output_dir, "similarity_analysis_results.json")
+                )
+                final_results = similarity_results
+
+                logger.info("\n" + "=" * 60)
+                logger.info("Step 7 complete")
+                logger.info("=" * 60)
+
+            logger.info("\n" + "=" * 60)
+            if self.enable_similarity_analysis and self.enable_deep_analysis:
+                logger.info("✓ Pipeline finished (full pipeline + deep analysis + similarity analysis)")
+            elif self.enable_deep_analysis:
+                logger.info("✓ Pipeline finished (full pipeline + deep analysis)")
+            elif self.enable_evaluation:
+                logger.info("✓ Pipeline finished (full pipeline)")
+            else:
+                logger.info("✓ Pipeline finished (basic pipeline)")
+            logger.info("=" * 60)
+
+            # Run the visualization automatically
+            logger.info("\n" + "=" * 60)
+            logger.info("Generating visualizations...")
+            logger.info("=" * 60)
+
+            try:
+                # Use the unified visualization script
+                viz_script = 'src/visualizers/search_results_visualizer.py'
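+                # Relative path: this assumes the pipeline is launched from the project root.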
+                logger.info(f"  Using visualization script: {viz_script}")
+
+                result = subprocess.run(
+                    ['python3', viz_script],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ Visualization generated")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"Visualization failed: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("Visualization timed out")
+            except Exception as e:
+                logger.error(f"Visualization error: {e}")
+
+            return final_results
+
+        except Exception as e:
+            logger.error(f"Pipeline failed: {e}")
+            raise
+
+
+def main():
+    """Entry point"""
+    parser = argparse.ArgumentParser(description='Enhanced search system V2')
+    parser.add_argument(
+        '--how-json',
+        default='input/posts/690d977d0000000007036331_how.json',
+        help='Path to the "how" deconstruction file'
+    )
+    parser.add_argument(
+        '--api-key',
+        default=None,
+        help='OpenRouter API key (read from the environment by default)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='output_v2',
+        help='Output directory'
+    )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='Keep the N highest-scoring search words per original feature (default 10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='Global cap on total searches (default None, unlimited)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='Search concurrency (default 3)'
+    )
+    parser.add_argument(
+        '--max-searches-per-feature',
+        type=int,
+        default=None,
+        help='Max searches per original feature (default None, unlimited)'
+    )
+    parser.add_argument(
+        '--max-searches-per-base-word',
+        type=int,
+        default=None,
+        help='Max searches per base_word (default None, unlimited)'
+    )
+    parser.add_argument(
+        '--enable-stage5',
+        action='store_true',
+        help='Enable result evaluation (default False)'
+    )
+    parser.add_argument(
+        '--stage5-max-workers',
+        type=int,
+        default=10,
+        help='Concurrency for result evaluation (default 10)'
+    )
+    parser.add_argument(
+        '--stage5-max-notes',
+        type=int,
+        default=20,
+        help='Max notes evaluated per search result (default 20)'
+    )
+    parser.add_argument(
+        '--enable-stage6',
+        action='store_true',
+        help='Enable deep deconstruction analysis'
+    )
+    parser.add_argument(
+        '--stage6-only',
+        action='store_true',
+        help='Run deep deconstruction only (starting from the evaluation results)'
+    )
+    parser.add_argument(
+        '--stage6-max-workers',
+        type=int,
+        default=5,
+        help='Deep deconstruction concurrency (default 5)'
+    )
+    parser.add_argument(
+        '--stage6-max-notes',
+        type=int,
+        default=None,
+        help='Max exact-match posts to deep-deconstruct (default None, unlimited)'
+    )
+    parser.add_argument(
+        '--stage6-skip',
+        type=int,
+        default=0,
+        help='Skip the first N exact-match posts in deep deconstruction (default 0)'
+    )
+    parser.add_argument(
+        '--stage6-sort-by',
+        type=str,
+        choices=['score', 'time', 'engagement'],
+        default='score',
+        help='Deep deconstruction ordering: score, time, or engagement'
+    )
+    parser.add_argument(
+        '--stage6-api-url',
+        type=str,
+        default='http://192.168.245.150:7000/what/analysis/single',
+        help='Deep deconstruction API endpoint'
+    )
+    parser.add_argument(
+        '--stage6-min-score',
+        type=float,
+        default=0.8,
+        help='Minimum score for deep deconstruction (default 0.8, on a 0-1 scale)'
+    )
+    parser.add_argument(
+        '--enable-stage8',
+        action='store_true',
+        help='Enable similarity analysis (default False; requires stage6)'
+    )
+    parser.add_argument(
+        '--stage8-weight-embedding',
+        type=float,
+        default=0.5,
+        help='Embedding-model weight for similarity analysis (default 0.5)'
+    )
+    parser.add_argument(
+        '--stage8-weight-semantic',
+        type=float,
+        default=0.5,
+        help='LLM-model weight for similarity analysis (default 0.5)'
+    )
+    parser.add_argument(
+        '--stage8-max-workers',
+        type=int,
+        default=5,
+        help='Similarity analysis concurrency (default 5)'
+    )
+    parser.add_argument(
+        '--stage8-min-similarity',
+        type=float,
+        default=0.0,
+        help='Minimum similarity threshold for similarity analysis (default 0.0)'
+    )
+
+    args = parser.parse_args()
+
+    # Create the system instance
+    system = EnhancedSearchV2(
+        how_json_path=args.how_json,
+        openrouter_api_key=args.api_key,
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers,
+        max_searches_per_feature=args.max_searches_per_feature,
+        max_searches_per_base_word=args.max_searches_per_base_word,
+        enable_evaluation=args.enable_stage5,
+        evaluation_max_workers=args.stage5_max_workers,
+        evaluation_max_notes_per_query=args.stage5_max_notes,
+        enable_deep_analysis=args.enable_stage6,
+        deep_analysis_only=args.stage6_only,
+        deep_analysis_max_workers=args.stage6_max_workers,
+        deep_analysis_max_notes=args.stage6_max_notes,
+        deep_analysis_skip_count=args.stage6_skip,
+        deep_analysis_sort_by=args.stage6_sort_by,
+        deep_analysis_api_url=args.stage6_api_url,
+        deep_analysis_min_score=args.stage6_min_score,
+        enable_similarity_analysis=args.enable_stage8,
+        similarity_weight_embedding=args.stage8_weight_embedding,
+        similarity_weight_semantic=args.stage8_weight_semantic,
+        similarity_max_workers=args.stage8_max_workers,
+        similarity_min_similarity=args.stage8_min_similarity
+    )
+
+    # Run the full pipeline
+    system.run_full_pipeline()
+
+
+if __name__ == '__main__':
+    # When run as a script, add the project root to the Python path
+    import sys
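+    # Three dirname() calls assume this file sits two directories below the
+    # project root (e.g. src/pipeline/enhanced_search_v2.py; path illustrative).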
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+
+    main()