|
@@ -0,0 +1,1577 @@
|
|
|
|
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LLM evaluation module.

Scores search-word quality and search-result relevance via an LLM
(OpenRouter) for the Xiaohongshu content pipeline.
"""

import logging
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.clients.openrouter_client import OpenRouterClient

logger = logging.getLogger(__name__)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class LLMEvaluator:
    """LLM-based evaluator for search words and search results."""
|
|
|
|
|
+
|
|
|
|
|
    def __init__(self, openrouter_client: OpenRouterClient):
        """
        Initialize the evaluator.

        Args:
            openrouter_client: OpenRouter client instance used for all LLM calls.
        """
        self.client = openrouter_client
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_search_word(
        self,
        original_feature: str,
        search_word: str
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of a combined search word (stage 4).

        Asks the LLM to judge how well `search_word` (built from a candidate
        vocabulary) can retrieve Xiaohongshu content related to
        `original_feature` without mentioning the feature verbatim.

        Args:
            original_feature: Original feature name the query should surface.
            search_word: Candidate vocabulary / combined search word to score.

        Returns:
            Dict with "score" (float; 0.0 on failure), "reasoning" (str) and
            "original_feature" (echoed for downstream bookkeeping).
        """
        # NOTE(review): the prompt's task text asks the model to extract and
        # combine query words, while the output schema only requests a
        # score + reasoning — confirm the prompt matches the intended output.
        prompt = f"""你是一个小红书内容分析专家。

# 任务说明
从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")

## 可选词汇
{search_word}

## 要求
1. 只能使用可选词汇中的词,可以进行以下变化:
   - 直接使用原词或括号内的同义词
   - 多个词组合
   - 适当精简
2. 不能添加可选词汇以外的新词
3. 按推荐程度排序(越靠前越推荐)

## 输出格式(JSON)
{{
    "score": 0.75,
    "reasoning": "评估理由"
}}

注意:只返回JSON,不要其他内容。"""

        result = self.client.chat_json(prompt=prompt, max_retries=3)

        if result:
            return {
                "score": result.get("score", 0.0),
                "reasoning": result.get("reasoning", ""),
                "original_feature": original_feature
            }
        else:
            # Degrade to a zero score so callers can still rank/filter.
            logger.error(f"评估搜索词失败: {search_word}")
            return {
                "score": 0.0,
                "reasoning": "LLM评估失败",
                "original_feature": original_feature
            }
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_words_batch(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_words: List[str],
|
|
|
|
|
+ max_workers: int = 5
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 批量评估搜索词(并行)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_words: 搜索词列表
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果列表(已排序)
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始批量评估 {len(search_words)} 个搜索词...")
|
|
|
|
|
+
|
|
|
|
|
+ results = []
|
|
|
|
|
+
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ # 提交任务
|
|
|
|
|
+ future_to_word = {
|
|
|
|
|
+ executor.submit(self.evaluate_search_word, original_feature, word): word
|
|
|
|
|
+ for word in search_words
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for idx, future in enumerate(as_completed(future_to_word), 1):
|
|
|
|
|
+ word = future_to_word[future]
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = future.result()
|
|
|
|
|
+ result["search_word"] = word
|
|
|
|
|
+ results.append(result)
|
|
|
|
|
+ logger.info(f" [{idx}/{len(search_words)}] {word}: {result['score']:.3f}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估失败: {word}, 错误: {e}")
|
|
|
|
|
+ results.append({
|
|
|
|
|
+ "search_word": word,
|
|
|
|
|
+ "score": 0.0,
|
|
|
|
|
+ "reasoning": f"评估异常: {str(e)}",
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ # 按分数排序
|
|
|
|
|
+ results.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ # 添加排名
|
|
|
|
|
+ for rank, result in enumerate(results, 1):
|
|
|
|
|
+ result["rank"] = rank
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"批量评估完成,最高分: {results[0]['score']:.3f}")
|
|
|
|
|
+
|
|
|
|
|
+ return results
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_words_in_batches(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_words: List[str],
|
|
|
|
|
+ batch_size: int = 50,
|
|
|
|
|
+ base_word: str = ""
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 分批评估搜索词(每批N个,减少API调用)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_words: 搜索词列表
|
|
|
|
|
+ batch_size: 每批处理的搜索词数量,默认10
|
|
|
|
|
+ base_word: 中心词(如果提供,要求所有组合必须包含此词)
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果列表(已排序)
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始分批评估 {len(search_words)} 个搜索词(每批 {batch_size} 个)...")
|
|
|
|
|
+
|
|
|
|
|
+ all_results = []
|
|
|
|
|
+ total_batches = (len(search_words) + batch_size - 1) // batch_size
|
|
|
|
|
+
|
|
|
|
|
+ # 分批处理
|
|
|
|
|
+ for batch_idx in range(total_batches):
|
|
|
|
|
+ start_idx = batch_idx * batch_size
|
|
|
|
|
+ end_idx = min(start_idx + batch_size, len(search_words))
|
|
|
|
|
+ batch_words = search_words[start_idx:end_idx]
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f" 处理第 {batch_idx + 1}/{total_batches} 批({len(batch_words)} 个搜索词)")
|
|
|
|
|
+
|
|
|
|
|
+ # 从搜索词中提取所有独特的词作为可选词汇
|
|
|
|
|
+ available_words_set = set()
|
|
|
|
|
+ for word in batch_words:
|
|
|
|
|
+ # 分割搜索词,提取单个词
|
|
|
|
|
+ parts = word.split()
|
|
|
|
|
+ available_words_set.update(parts)
|
|
|
|
|
+
|
|
|
|
|
+ # 转换为列表并排序(保证稳定性)
|
|
|
|
|
+ available_words = sorted(list(available_words_set))
|
|
|
|
|
+
|
|
|
|
|
+ # 构建可选词汇字符串(逗号分隔)
|
|
|
|
|
+ available_words_str = "、".join(available_words)
|
|
|
|
|
+
|
|
|
|
|
+ # 构建 base_word 约束
|
|
|
|
|
+ base_word_constraint = ""
|
|
|
|
|
+ if base_word:
|
|
|
|
|
+ base_word_constraint = f"""
|
|
|
|
|
+## 中心词约束(重要)
|
|
|
|
|
+- 所有组合词都基于中心词: **{base_word}**
|
|
|
|
|
+- **禁止去掉中心词**,你只负责评分和排序
|
|
|
|
|
+- source_word 必须包含 "{base_word}"
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+ prompt = f"""
|
|
|
|
|
+
|
|
|
|
|
+# 任务说明
|
|
|
|
|
+模拟你是一个内容创作者,评估并排序这些基于中心词的搜索组合。
|
|
|
|
|
+{base_word_constraint}
|
|
|
|
|
+
|
|
|
|
|
+## 可选词汇
|
|
|
|
|
+{available_words_str}
|
|
|
|
|
+
|
|
|
|
|
+## 要求
|
|
|
|
|
+1. 只能使用可选词汇中的词,可以进行以下变化:
|
|
|
|
|
+ - 直接使用原词或括号内的同义词
|
|
|
|
|
+ - 多个词组合
|
|
|
|
|
+ - 适当精简
|
|
|
|
|
+2. **source_word 必须包含中心词 "{base_word}"**(如果提供了中心词)
|
|
|
|
|
+3. 不能添加可选词汇以外的新词
|
|
|
|
|
+4. 按推荐程度排序(越靠前越推荐),取top5
|
|
|
|
|
+
|
|
|
|
|
+## 输出格式(JSON):
|
|
|
|
|
+[
|
|
|
|
|
+ {{
|
|
|
|
|
+ "rank": 1,
|
|
|
|
|
+ "search_word": "组合的搜索词",
|
|
|
|
|
+ "source_word": "组合来源词,空格分割,组合来源词都是从available_words_str中选取的",
|
|
|
|
|
+ "score": 0.85,
|
|
|
|
|
+ "reasoning": "推荐理由"
|
|
|
|
|
+ }},
|
|
|
|
|
+ {{
|
|
|
|
|
+ "index": 2,
|
|
|
|
|
+ "search_word": "组合的搜索词",
|
|
|
|
|
+ "source_word": "组合来源词,空格分割,组合来源词都是从available_words_str中选取的",
|
|
|
|
|
+ "score": 0.80,
|
|
|
|
|
+ "reasoning": "推荐理由"
|
|
|
|
|
+ }}
|
|
|
|
|
+]
|
|
|
|
|
+- 只返回JSON数组,不要其他内容"""
|
|
|
|
|
+
|
|
|
|
|
+ # 调用LLM
|
|
|
|
|
+ result = self.client.chat_json(prompt=prompt, max_retries=3)
|
|
|
|
|
+
|
|
|
|
|
+ if result and isinstance(result, list):
|
|
|
|
|
+ # 处理结果 - 新格式直接包含search_word
|
|
|
|
|
+ for idx, item in enumerate(result):
|
|
|
|
|
+ search_word = item.get("search_word", "")
|
|
|
|
|
+ if search_word: # 确保有搜索词
|
|
|
|
|
+ all_results.append({
|
|
|
|
|
+ "search_word": search_word,
|
|
|
|
|
+ "source_word": item.get("source_word", ""),
|
|
|
|
|
+ "score": item.get("score", 0.0),
|
|
|
|
|
+ "reasoning": item.get("reasoning", ""),
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+ logger.info(f" [{start_idx + idx + 1}/{len(search_words)}] "
|
|
|
|
|
+ f"{search_word}: {item.get('score', 0.0):.3f}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 第 {batch_idx + 1} 批评估失败,跳过")
|
|
|
|
|
+ # 为失败的批次添加默认结果(使用原搜索词)
|
|
|
|
|
+ for word in batch_words:
|
|
|
|
|
+ all_results.append({
|
|
|
|
|
+ "search_word": word,
|
|
|
|
|
+ "score": 0.0,
|
|
|
|
|
+ "reasoning": "批量评估失败",
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ # 按分数排序
|
|
|
|
|
+ all_results.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ # 添加排名
|
|
|
|
|
+ for rank, result in enumerate(all_results, 1):
|
|
|
|
|
+ result["rank"] = rank
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"分批评估完成,最高分: {all_results[0]['score']:.3f} (总API调用: {total_batches} 次)")
|
|
|
|
|
+
|
|
|
|
|
+ return all_results
|
|
|
|
|
+
|
|
|
|
|
    def generate_queries_from_candidates(
        self,
        original_feature: str,
        base_word: str,
        candidate_words: List[str],
        max_queries: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Ask the LLM to compose search queries from a center word plus a
        candidate-word list.

        Args:
            original_feature: Original feature name (used for final scoring
                only, never as query material).
            base_word: Center word anchoring the queries.
            candidate_words: Candidate words available for combination.
            max_queries: Maximum number of queries kept from the LLM output.

        Returns:
            Query dicts in the legacy format: "search_word", "source_word"
            (validated), "score", "reasoning", "rank", "original_feature".
            Empty list when the LLM output is malformed.
        """
        logger.info(f"LLM生成query(中心词: {base_word}, 候选词: {len(candidate_words)}个)")

        candidate_words_str = "、".join(candidate_words)

        # NOTE(review): the prompt contains a stray lone "搜" character just
        # before "# 输入格式" — likely a typo; confirm before removing.
        prompt = f"""# 角色
你是一个专业的搜索query生成专家。你的任务是根据输入信息,生成最优的搜索query组合。

## 核心规则(必须严格遵守)
1. **绝对禁止**:
   - 目标动机严格隔离,仅用于最终匹配度评估,Query生成过程中不得使用目标动机原文
   - Query中不得包含动机词汇(如"如何"、"方法"、"技巧"、"教程"等意图词)
   - 所有分析基于真实信息,不可假设推导

2. **query构成**:仅由"中心词(如果有)+待选词"直接组成,无额外信息
3. **query结构**:2-4个词,考虑前后顺序,无相似或语义重叠的query
4. **输出数量**:生成1-4条备选query
搜
# 输入格式
目标特征:{original_feature}
中心词:{base_word}
待选词:{candidate_words_str}

注:带权重的词用括号标注权重值,无权重或权重为0则平权

# query生成流程

## 第一步:待选词预处理
**去重**
- 去除完全重复的词,优先保留权重高的的词

## 第二步:待选词关联性分析
**如果有中心词:**
分析每个待选词与中心词的语义关联强度,判断哪些词与中心词组合能形成有意义的搜索语义

**如果无中心词:**
分析待选词之间的语义关联强度,判断哪些词组合能形成完整的搜索语义场

**关联性分级:**
- **强关联(0.7-1.0)**:两词在语义上紧密配合,常在同一场景共现,组合后形成完整概念
- **中关联(0.4-0.69)**:两词有明确关联但不强制共现,组合后有一定语义增益
- **无关联(0.0-0.39)**:两词无明显语义关联,组合无意义

## 第三步:互补性分析

对关联度较高的词进行互补性判断:

**互补性分级:**
- **强互补**:两词描述不同维度,组合后语义更完整(如:主体+场景、形式+内容)
- **弱互补**:两词有差异但语义部分重叠
- **语义重叠**:两词描述同一维度,组合无新增价值(避免)

**常见互补维度组合:**
- 主体+场景
- 形式+内容
- 内容+应用方式
- 载体+场景+情绪

## 第四步:优先级排序

**综合排序考量:**
- 与中心词(或其他待选词)的关联强度
- 原始权重高低
- 互补性强弱

**排序原则:**
强关联+高权重+强互补 > 强关联+无权重+强互补 > 中关联+高权重

## 第五步:组合生成query

**组合策略:**

**如果有中心词:**
1. 中心词 + 强关联且强互补的待选词(1-2个)
2. 中心词 + 强关联但弱互补的待选词(1-2个)
3. 仅用待选词组合(当纯待选词组合语义更完整时)

**如果无中心词:**
1. 2-3个强关联且强互补的待选词组合
2. 1个核心词 + 1-2个中关联但强互补词

**组合规则:**
- 同一语义维度只保留1个最优词
- 优先选择互补性强的词组合
- 构成词数控制在2-3个
- 考虑词的前后顺序(词定语在前,核心名词在后;场景词在前,实体词在后)
**组合理由:**
说明为什么选择这些词组合,词与词之间如何协同工作,形成什么样的搜索语义场

## 第六步:query与目标动机匹配度评估
**重要说明:** 只有在query生成完成后,才将query与目标动机进行匹配度评估
**匹配分含义:**
匹配分 = 此query能找到目标动机所需内容的概率(0-1之间)

**评分标准:**
- **0.8-1.0分**:query在语意上与目标强关联,能精准召回目标动机所需内容,覆盖核心要素
- **0.4-0.79分**:query语意部分覆盖目标特征,能召回相关内容但可能不够精准,部分覆盖目标
- **0.39分以下**:query召回内容可能偏离目标动机

**评分维度:**
- query的语义场是否覆盖目标动机的核心要素
- query能否精准定位到目标所需的内容类型
- query在搜索引擎中的可召回性

**组合推理要求:**
用流畅的段落说明:
- query形成了什么样的搜索语义场
- 这个语义场如何与目标动机产生关联
- 为什么这个query能/不能召回目标所需内容
- 使用因果关联词(因为/由于/所以/因此)串联逻辑
- 避免"该query"、"这个"等模糊指代

# 输出格式
最终按照以下json格式输出
{{
    "queries": [
        {{
            "query": "查询词",
            "中心词": "{base_word}",
            "组合理由": "query词组合理由的详细说明,深度解释该query与目标及中心词的逻辑关联。目标特征的核心诉求是什么,基于这个诉求,选择了哪些词,为什么这些词最相关(说明权重、语义覆盖等原因)这些词如何协同工作,形成什么样的搜索语义场,词与词之间有什么语义延展关系,这个query预期能召回什么类型的内容,为什么能找到目标",
            "与目标匹配分": 0.85,
            "source_word ": "来源词,待选词和中心词组合"
        }}
    ]
}}


**关键点:**
1. query生成阶段:只考虑词与词之间的语义关联和互补性
2. 匹配评估阶段:才将生成的query与目标动机进行匹配度分析
3. 目标动机不参与query生成,仅用于最终评估

注意:只返回JSON,不要其他内容。"""

        # Call the LLM.
        llm_results = self.client.chat_json(prompt=prompt, max_retries=3)

        # Adapt to the new output format: {"queries": [...]}.
        if not llm_results or not isinstance(llm_results, dict):
            logger.error("LLM返回格式错误:期待dict格式")
            return []

        queries_list = llm_results.get("queries", [])
        if not isinstance(queries_list, list):
            logger.error("LLM返回格式错误:queries字段不是列表")
            return []

        logger.info(f"LLM生成了 {len(queries_list)} 个query")

        # Parse and validate each query item.
        formatted_results = []
        for rank, item in enumerate(queries_list[:max_queries], 1):
            # Field mapping from LLM output:
            #   "query" -> search_word
            #   "source_word " (note trailing space) -> source_word
            #   "组合理由" -> reasoning
            #   "与目标匹配分" -> score
            query_text = item.get("query", "")
            # Try the trailing-space key first; the prompt schema literally uses it.
            source_word_raw = item.get("source_word ", item.get("source_word", ""))

            # Restrict source_word to tokens from the allowed vocabulary.
            validated_source_word = self._validate_and_fix_source_word(
                llm_source_word=source_word_raw,
                query=query_text,
                base_word=base_word,
                candidate_words=candidate_words
            )

            formatted_results.append({
                "search_word": query_text,
                "source_word": validated_source_word,
                "score": item.get("与目标匹配分", 0.0),  # score reported by the LLM
                "reasoning": item.get("组合理由", ""),
                "rank": rank,
                "original_feature": original_feature
            })

        return formatted_results
|
|
|
|
|
+
|
|
|
|
|
+ def _validate_and_fix_source_word(
|
|
|
|
|
+ self,
|
|
|
|
|
+ llm_source_word: str,
|
|
|
|
|
+ query: str,
|
|
|
|
|
+ base_word: str,
|
|
|
|
|
+ candidate_words: List[str]
|
|
|
|
|
+ ) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 验证并修正 LLM 输出的 source_word
|
|
|
|
|
+ 确保只包含"中心词 + 候选词"中的词
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ llm_source_word: LLM 输出的 source_word
|
|
|
|
|
+ query: 生成的 search_word
|
|
|
|
|
+ base_word: 中心词
|
|
|
|
|
+ candidate_words: 候选词列表
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 验证后的 source_word
|
|
|
|
|
+ """
|
|
|
|
|
+ words = llm_source_word.split()
|
|
|
|
|
+ valid_words = []
|
|
|
|
|
+
|
|
|
|
|
+ # 验证每个词是否在允许列表中
|
|
|
|
|
+ for word in words:
|
|
|
|
|
+ if word == base_word or word in candidate_words:
|
|
|
|
|
+ valid_words.append(word)
|
|
|
|
|
+
|
|
|
|
|
+ # 确保中心词存在(如果query中包含)
|
|
|
|
|
+ if base_word in query and base_word not in valid_words:
|
|
|
|
|
+ valid_words.insert(0, base_word)
|
|
|
|
|
+
|
|
|
|
|
+ # 去重
|
|
|
|
|
+ seen = set()
|
|
|
|
|
+ deduplicated = []
|
|
|
|
|
+ for word in valid_words:
|
|
|
|
|
+ if word not in seen:
|
|
|
|
|
+ seen.add(word)
|
|
|
|
|
+ deduplicated.append(word)
|
|
|
|
|
+
|
|
|
|
|
+ return ' '.join(deduplicated)
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_single_note(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_word: str,
|
|
|
|
|
+ note: Dict[str, Any],
|
|
|
|
|
+ note_index: int = 0
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 评估单个帖子(阶段6,多模态)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_word: 搜索词
|
|
|
|
|
+ note: 单个帖子
|
|
|
|
|
+ note_index: 帖子索引
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 单个帖子的评估结果
|
|
|
|
|
+ """
|
|
|
|
|
+ card = note.get("note_card", {})
|
|
|
|
|
+ title = card.get("display_title", "")
|
|
|
|
|
+ desc = card.get("desc", "")[:500] # 限制长度
|
|
|
|
|
+ images = card.get("image_list", [])[:10] # 最多10张图
|
|
|
|
|
+
|
|
|
|
|
+ prompt = f"""你是一个小红书内容分析专家。
|
|
|
|
|
+
|
|
|
|
|
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
|
|
|
|
|
+
|
|
|
|
|
+原始特征:"{original_feature}"
|
|
|
|
|
+搜索词:"{search_word}"
|
|
|
|
|
+
|
|
|
|
|
+帖子内容:
|
|
|
|
|
+标题: {title}
|
|
|
|
|
+正文: {desc}
|
|
|
|
|
+
|
|
|
|
|
+请分析帖子的文字和图片内容,返回JSON格式:
|
|
|
|
|
+{{
|
|
|
|
|
+ "relevance": 0.85, // 0.0-1.0,相关度
|
|
|
|
|
+ "matched_elements": ["元素1", "元素2"], // 匹配的元素列表
|
|
|
|
|
+ "reasoning": "简短的匹配理由"
|
|
|
|
|
+}}
|
|
|
|
|
+
|
|
|
|
|
+只返回JSON,不要其他内容。"""
|
|
|
|
|
+
|
|
|
|
|
+ result = self.client.chat_json(
|
|
|
|
|
+ prompt=prompt,
|
|
|
|
|
+ images=images if images else None,
|
|
|
|
|
+ max_retries=3
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if result:
|
|
|
|
|
+ return {
|
|
|
|
|
+ "note_index": note_index,
|
|
|
|
|
+ "relevance": result.get("relevance", 0.0),
|
|
|
|
|
+ "matched_elements": result.get("matched_elements", []),
|
|
|
|
|
+ "reasoning": result.get("reasoning", "")
|
|
|
|
|
+ }
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 评估帖子 {note_index} 失败: {search_word}")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "note_index": note_index,
|
|
|
|
|
+ "relevance": 0.0,
|
|
|
|
|
+ "matched_elements": [],
|
|
|
|
|
+ "reasoning": "评估失败"
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_results_parallel(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_word: str,
|
|
|
|
|
+ notes: List[Dict[str, Any]],
|
|
|
|
|
+ max_notes: int = 20,
|
|
|
|
|
+ max_workers: int = 20
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 并行评估搜索结果(每个帖子独立评估)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_word: 搜索词
|
|
|
|
|
+ notes: 帖子列表
|
|
|
|
|
+ max_notes: 最多评估几条帖子
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果汇总
|
|
|
|
|
+ """
|
|
|
|
|
+ if not notes:
|
|
|
|
|
+ return {
|
|
|
|
|
+ "overall_relevance": 0.0,
|
|
|
|
|
+ "extracted_elements": [],
|
|
|
|
|
+ "evaluated_notes": []
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ notes_to_eval = notes[:max_notes]
|
|
|
|
|
+ evaluated_notes = []
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f" 并行评估 {len(notes_to_eval)} 个帖子({max_workers}并发)")
|
|
|
|
|
+
|
|
|
|
|
+ # 20并发评估每个帖子
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ futures = []
|
|
|
|
|
+ for idx, note in enumerate(notes_to_eval):
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self.evaluate_single_note,
|
|
|
|
|
+ original_feature,
|
|
|
|
|
+ search_word,
|
|
|
|
|
+ note,
|
|
|
|
|
+ idx
|
|
|
|
|
+ )
|
|
|
|
|
+ futures.append(future)
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for future in as_completed(futures):
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = future.result()
|
|
|
|
|
+ evaluated_notes.append(result)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估帖子失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 按note_index排序
|
|
|
|
|
+ evaluated_notes.sort(key=lambda x: x['note_index'])
|
|
|
|
|
+
|
|
|
|
|
+ # 汇总:计算整体相关度和提取元素
|
|
|
|
|
+ if evaluated_notes:
|
|
|
|
|
+ overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
|
|
|
|
|
+
|
|
|
|
|
+ # 提取所有元素并统计频次
|
|
|
|
|
+ element_counts = {}
|
|
|
|
|
+ for note in evaluated_notes:
|
|
|
|
|
+ for elem in note['matched_elements']:
|
|
|
|
|
+ element_counts[elem] = element_counts.get(elem, 0) + 1
|
|
|
|
|
+
|
|
|
|
|
+ # 按频次排序,取前5个
|
|
|
|
|
+ extracted_elements = sorted(
|
|
|
|
|
+ element_counts.keys(),
|
|
|
|
|
+ key=lambda x: element_counts[x],
|
|
|
|
|
+ reverse=True
|
|
|
|
|
+ )[:5]
|
|
|
|
|
+ else:
|
|
|
|
|
+ overall_relevance = 0.0
|
|
|
|
|
+ extracted_elements = []
|
|
|
|
|
+
|
|
|
|
|
+ return {
|
|
|
|
|
+ "overall_relevance": overall_relevance,
|
|
|
|
|
+ "extracted_elements": extracted_elements,
|
|
|
|
|
+ "evaluated_notes": evaluated_notes
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_search_results(
        self,
        original_feature: str,
        search_word: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 5,
        max_images_per_note: int = 10
    ) -> Dict[str, Any]:
        """
        Evaluate search results in one multimodal LLM call (stage 6).

        Args:
            original_feature: Target feature to look for.
            search_word: Search word that produced the notes.
            notes: Note (post) list.
            max_notes: Maximum number of notes evaluated.
            max_images_per_note: Maximum images taken from each note.

        Returns:
            Dict with "overall_relevance", "extracted_elements",
            "recommended_extension" and per-note "evaluated_notes";
            zeroed/empty defaults when there are no notes or the LLM fails.
        """
        if not notes:
            return {
                "overall_relevance": 0.0,
                "extracted_elements": [],
                "recommended_extension": None,
                "evaluated_notes": []
            }

        # Cap the number of notes sent to the LLM.
        notes_to_eval = notes[:max_notes]

        # Gather text snippets and images for the prompt.
        notes_info = []
        all_images = []

        for idx, note in enumerate(notes_to_eval):
            card = note.get("note_card", {})
            title = card.get("display_title", "")
            desc = card.get("desc", "")[:300]  # cap body length

            notes_info.append({
                "index": idx,
                "title": title,
                "desc": desc
            })

            # Collect up to max_images_per_note images per note.
            images = card.get("image_list", [])[:max_images_per_note]
            all_images.extend(images)

        # Render the per-note text section of the prompt.
        notes_text = "\n\n".join([
            f"帖子 {n['index']}:\n标题: {n['title']}\n正文: {n['desc']}"
            for n in notes_info
        ])

        prompt = f"""你是一个小红书内容分析专家。

任务:评估搜索结果是否包含目标特征的元素

原始特征:"{original_feature}"
搜索词:"{search_word}"
帖子数量:{len(notes_to_eval)} 条

帖子内容:
{notes_text}

请综合分析帖子的文字和图片内容,判断:
1. 这些搜索结果中是否包含与"{original_feature}"相似的元素
2. 提取最相关的元素关键词(2-4个字的词组)
3. 推荐最适合用于扩展搜索的关键词

返回JSON格式:
{{
    "overall_relevance": 0.72, // 0.0-1.0,整体相关度
    "extracted_elements": ["关键词1", "关键词2", "关键词3"], // 提取的相似元素,按相关度排序
    "recommended_extension": "关键词1", // 最优的扩展关键词
    "evaluated_notes": [
        {{
            "note_index": 0, // 帖子索引
            "relevance": 0.85, // 该帖子的相关度
            "matched_elements": ["元素1", "元素2"], // 该帖子匹配的元素
            "reasoning": "简短的匹配理由"
        }}
    ]
}}

注意:
- extracted_elements 应该是帖子中实际包含的、与原始特征相似的元素
- 优先提取在图片或文字中明显出现的元素
- 只返回JSON,不要其他内容"""

        # Multimodal LLM call (images attached when available).
        result = self.client.chat_json(
            prompt=prompt,
            images=all_images if all_images else None,
            max_retries=3
        )

        if result:
            # Normalize to the full expected shape.
            return {
                "overall_relevance": result.get("overall_relevance", 0.0),
                "extracted_elements": result.get("extracted_elements", []),
                "recommended_extension": result.get("recommended_extension"),
                "evaluated_notes": result.get("evaluated_notes", [])
            }
        else:
            logger.error(f"评估搜索结果失败: {search_word}")
            return {
                "overall_relevance": 0.0,
                "extracted_elements": [],
                "recommended_extension": None,
                "evaluated_notes": []
            }
|
|
|
|
|
+
|
|
|
|
|
+ def batch_evaluate_search_results(
|
|
|
|
|
+ self,
|
|
|
|
|
+ features_with_results: List[Dict[str, Any]],
|
|
|
|
|
+ max_workers: int = 3
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 批量评估搜索结果(并行,但并发数较低以避免超时)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ features_with_results: 带搜索结果的特征列表
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 带评估结果的特征列表
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始批量评估 {len(features_with_results)} 个搜索结果...")
|
|
|
|
|
+
|
|
|
|
|
+ results = []
|
|
|
|
|
+
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ # 提交任务
|
|
|
|
|
+ future_to_feature = {}
|
|
|
|
|
+ for feature in features_with_results:
|
|
|
|
|
+ if not feature.get("search_result"):
|
|
|
|
|
+ # 无搜索结果,跳过
|
|
|
|
|
+ feature["result_evaluation"] = None
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ original_feature = self._get_original_feature(feature)
|
|
|
|
|
+ search_word = feature.get("search_word", "")
|
|
|
|
|
+ notes = feature["search_result"].get("data", {}).get("data", [])
|
|
|
|
|
+
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self.evaluate_search_results,
|
|
|
|
|
+ original_feature,
|
|
|
|
|
+ search_word,
|
|
|
|
|
+ notes
|
|
|
|
|
+ )
|
|
|
|
|
+ future_to_feature[future] = feature
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for idx, future in enumerate(as_completed(future_to_feature), 1):
|
|
|
|
|
+ feature = future_to_feature[future]
|
|
|
|
|
+ try:
|
|
|
|
|
+ evaluation = future.result()
|
|
|
|
|
+ feature["result_evaluation"] = evaluation
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+ logger.info(f" [{idx}/{len(future_to_feature)}] {feature.get('search_word')}: "
|
|
|
|
|
+ f"relevance={evaluation['overall_relevance']:.3f}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估失败: {feature.get('search_word')}, 错误: {e}")
|
|
|
|
|
+ feature["result_evaluation"] = None
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"批量评估完成")
|
|
|
|
|
+
|
|
|
|
|
+ return results
|
|
|
|
|
+
|
|
|
|
|
+ def _get_original_feature(self, feature_node: Dict[str, Any]) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 从特征节点中获取原始特征名称
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ feature_node: 特征节点
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 原始特征名称
|
|
|
|
|
+ """
|
|
|
|
|
+ # 尝试从llm_evaluation中获取
|
|
|
|
|
+ if "llm_evaluation" in feature_node:
|
|
|
|
|
+ return feature_node["llm_evaluation"].get("original_feature", "")
|
|
|
|
|
+
|
|
|
|
|
+ # 尝试从其他字段获取
|
|
|
|
|
+ return feature_node.get("原始特征名称", feature_node.get("特征名称", ""))
|
|
|
|
|
+
|
|
|
|
|
+    # ========== Stage 6: two-layer evaluation methods ==========
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_query_relevance_batch(
        self,
        search_query: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 20
    ) -> Dict[str, Any]:
        """
        Layer-1 evaluation: batch-judge whether search results are relevant
        to the query, in a single LLM call.

        Args:
            search_query: The search query.
            notes: Note (post) list.
            max_notes: Maximum number of notes evaluated.

        Returns:
            Mapping like:
            {
                "note_0": {"与query相关性": "相关", "说明": "..."},
                "note_1": {"与query相关性": "不相关", "说明": "..."},
                ...
            }
            Empty dict for empty input. On LLM failure, every note is
            conservatively marked "相关" (kept).
        """
        if not notes:
            return {}

        notes_to_eval = notes[:max_notes]

        # Build the note-list section of the prompt.
        notes_text = ""
        for idx, note in enumerate(notes_to_eval):
            note_card = note.get('note_card', {})
            title = note_card.get('display_title', '')
            content = note_card.get('desc', '')[:800]  # cap body length
            images = note_card.get('image_list', [])

            notes_text += f"note_{idx}:\n"
            notes_text += f"- 标题: {title}\n"
            notes_text += f"- 正文: {content}\n"
            notes_text += f"- 图像: {len(images)}张图片\n\n"

        # Full layer-1 evaluation prompt (supplied verbatim; do not simplify).
        prompt = f"""# 任务说明
判断搜索结果是否与搜索Query相关,过滤掉完全无关的结果。

# 输入信息

搜索Query: {search_query}

搜索结果列表:
{notes_text}

# 判断标准
✅ 相关(保留)
搜索结果的标题、正文或图像内容中包含Query相关的信息:

Query的核心关键词在结果中出现
或 结果讨论的主题与Query直接相关
或 结果是Query概念的上位/下位/平行概念


❌ 不相关(过滤)
搜索结果与Query完全无关:
Query的关键词完全未出现
结果主题与Query无任何关联
仅因搜索引擎误匹配而出现


## 判断示例
Query "墨镜搭配" → 结果"太阳镜选购指南" ✅ 保留(墨镜=太阳镜)
Query "墨镜搭配" → 结果"眼镜搭配技巧" ✅ 保留(眼镜是墨镜的上位概念)
Query "墨镜搭配" → 结果"帽子搭配技巧" ❌ 过滤(完全无关)
Query "复古滤镜" → 结果"滤镜调色教程" ✅ 保留(包含滤镜)
Query "复古滤镜" → 结果"相机推荐" ❌ 过滤(主题不相关)

# 输出格式
{{
    "note_0": {{
        "与query相关性": "相关 / 不相关",
        "说明": ""
    }},
    "note_1": {{
        "与query相关性": "相关 / 不相关",
        "说明": ""
    }}
}}

# 特殊情况处理

- 如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行判断
- 当无法明确判断时,倾向于保留(标记为"相关")

只返回JSON,不要其他内容。"""

        # Single LLM call covering the whole batch.
        result = self.client.chat_json(
            prompt=prompt,
            max_retries=3
        )

        if result:
            return result
        else:
            logger.error(f"  第一层批量评估失败: Query={search_query}")
            # Conservative fallback: keep every note ("相关") so nothing is lost.
            default_result = {}
            for idx in range(len(notes_to_eval)):
                default_result[f"note_{idx}"] = {
                    "与query相关性": "相关",
                    "说明": "LLM评估失败,默认保留"
                }
            return default_result
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_feature_matching_single(
|
|
|
|
|
+ self,
|
|
|
|
|
+ target_feature: str,
|
|
|
|
|
+ note_title: str,
|
|
|
|
|
+ note_content: str,
|
|
|
|
|
+ note_images: List[str],
|
|
|
|
|
+ note_index: int
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 第二层评估:评估单个笔记与目标特征的匹配度
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ target_feature: 目标特征
|
|
|
|
|
+ note_title: 笔记标题
|
|
|
|
|
+ note_content: 笔记正文
|
|
|
|
|
+ note_images: 图片URL列表
|
|
|
|
|
+ note_index: 笔记索引
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ {
|
|
|
|
|
+ "综合得分": 0.9, # 0-1分
|
|
|
|
|
+ "匹配类型": "完全匹配",
|
|
|
|
|
+ "评分说明": "...",
|
|
|
|
|
+ "关键匹配点": [...]
|
|
|
|
|
+ }
|
|
|
|
|
+ """
|
|
|
|
|
+ # 构建完整的第二层评估 Prompt(用户提供,不简化)
|
|
|
|
|
+ prompt = f"""# 任务说明
|
|
|
|
|
+你需要判断搜索到的案例与目标特征的相关性。
|
|
|
|
|
+
|
|
|
|
|
+# 输入信息
|
|
|
|
|
+目标特征:{target_feature}
|
|
|
|
|
+
|
|
|
|
|
+搜索结果:
|
|
|
|
|
+- 标题: {note_title}
|
|
|
|
|
+- 正文: {note_content[:800]}
|
|
|
|
|
+- 图像: {len(note_images)}张图片(请仔细分析图片内容,包括OCR提取图片中的文字)
|
|
|
|
|
+
|
|
|
|
|
+# 判断流程
|
|
|
|
|
+## 目标特征匹配度评分
|
|
|
|
|
+综合考虑语义相似度(概念匹配、层级关系)和场景关联度(应用场景、使用语境)进行评分:
|
|
|
|
|
+
|
|
|
|
|
+- 0.8-1分:完全匹配
|
|
|
|
|
+语义层面:找到与目标特征完全相同或高度一致的内容,核心概念完全一致
|
|
|
|
|
+场景层面:完全适用于同一场景、受众、平台和语境
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"复古滤镜" + 小红书穿搭场景 vs 结果"小红书复古滤镜调色教程"
|
|
|
|
|
+目标"墨镜" + 时尚搭配场景 vs 结果"时尚墨镜搭配指南"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.6-0.7分:相似匹配
|
|
|
|
|
+语义层面:
|
|
|
|
|
+结果是目标的上位概念(更宽泛)或下位概念(更具体)
|
|
|
|
|
+或属于同一概念的不同表现形式,或属于平行概念(同级不同类)
|
|
|
|
|
+场景层面:场景相近但有差异,需要筛选或调整后可用
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"墨镜" + 时尚搭配 vs 结果"眼镜搭配技巧"(上位概念,需筛选)
|
|
|
|
|
+目标"怀旧滤镜" + 人像拍摄 vs 结果"胶片感调色"(不同表现形式)
|
|
|
|
|
+目标"日常穿搭" + 街拍 vs 结果"通勤穿搭拍照"(场景相近)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.5-0.6分:弱相似
|
|
|
|
|
+语义层面:属于同一大类但具体方向或侧重点明显不同,仅提供了相关概念
|
|
|
|
|
+场景层面:场景有明显差异,迁移需要较大改造
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"户外运动穿搭" vs 结果"健身房穿搭指南"
|
|
|
|
|
+目标"小红书图文笔记" vs 结果"抖音短视频脚本"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.4分及以下:无匹配
|
|
|
|
|
+语义层面:仅表面词汇重叠,实质关联弱,或概念距离过远
|
|
|
|
|
+场景层面:应用场景基本不同或完全不同
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"墨镜" vs 结果"配饰大全"(概念过于宽泛)
|
|
|
|
|
+目标"美食摄影构图" vs 结果"美食博主日常vlog"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+## 概念层级关系说明
|
|
|
|
|
+在评分时,需要注意概念层级关系的影响:
|
|
|
|
|
+完全匹配(同一概念 + 同场景)→ 0.8-1分
|
|
|
|
|
+目标"墨镜" vs 结果"墨镜搭配",且都在时尚搭配场景
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+上位/下位概念(层级差一层)→ 通常0.6-0.7分
|
|
|
|
|
+目标"墨镜" vs 结果"眼镜搭配"(结果更宽泛,需筛选)
|
|
|
|
|
+目标"眼镜" vs 结果"墨镜选购"(结果更具体,部分适用)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+平行概念(同级不同类)→ 通常0.6-0.7分
|
|
|
|
|
+目标"墨镜" vs 结果"近视眼镜"(都是眼镜类,但功能场景不同)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+远距离概念(层级差两层及以上)→ 0.5分及以下
|
|
|
|
|
+目标"墨镜" vs 结果"配饰"(概念过于宽泛,指导性弱)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# 匹配结论判断
|
|
|
|
|
+根据综合得分判定匹配类型:
|
|
|
|
|
+
|
|
|
|
|
+0.8-1.0分:✅ 完全匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:找到了目标特征的直接灵感来源
|
|
|
|
|
+建议:直接采纳为该特征的灵感溯源结果
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+0.6-0.79分:⚠️ 相似匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:找到了相关的灵感参考,但存在一定差异
|
|
|
|
|
+建议:作为候选结果保留,可与其他结果综合判断或继续搜索更精确的匹配
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+0.59分及以下:❌ 无匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:该结果与目标特征关联度不足
|
|
|
|
|
+建议:排除该结果,需要调整搜索策略继续寻找
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# 输出格式
|
|
|
|
|
+{{
|
|
|
|
|
+ "综合得分": 0.7,
|
|
|
|
|
+ "匹配类型": "相似匹配",
|
|
|
|
|
+ "评分说明": "结果'眼镜搭配技巧'是目标'墨镜'的上位概念,内容涵盖多种眼镜类型。场景都是时尚搭配,但需要从结果中筛选出墨镜相关的内容。概念关系:上位概念(宽泛一层)",
|
|
|
|
|
+ "关键匹配点": [
|
|
|
|
|
+ "眼镜与脸型的搭配原则(部分适用于墨镜)",
|
|
|
|
|
+ "配饰的风格选择方法"
|
|
|
|
|
+ ]
|
|
|
|
|
+}}
|
|
|
|
|
+
|
|
|
|
|
+# 特殊情况处理
|
|
|
|
|
+复合特征评估:如果目标特征是复合型(如"复古滤镜+第一人称视角"),需要分别评估每个子特征的匹配度,然后取平均值作为最终得分
|
|
|
|
|
+信息不完整:如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行评分
|
|
|
|
|
+上位概念的实用性:当结果是目标的上位概念时,评分应考虑:内容中目标相关部分的占比;是否提供了可直接应用于目标的知识;场景的一致性程度;如果结果虽是上位概念但完全不涉及目标内容,应降至5-6分或更低
|
|
|
|
|
+
|
|
|
|
|
+只返回JSON,不要其他内容。"""
|
|
|
|
|
+
|
|
|
|
|
+ # 调用 LLM(传递图片进行多模态分析)
|
|
|
|
|
+ result = self.client.chat_json(
|
|
|
|
|
+ prompt=prompt,
|
|
|
|
|
+ images=note_images if note_images else None,
|
|
|
|
|
+ max_retries=3
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if result:
|
|
|
|
|
+ return result
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 第二层评估失败: note {note_index}, target={target_feature}")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "综合得分": 0.0,
|
|
|
|
|
+ "匹配类型": "评估失败",
|
|
|
|
|
+ "评分说明": "LLM评估失败",
|
|
|
|
|
+ "关键匹配点": []
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_note_with_filter(
        self,
        search_query: str,
        target_feature: str,
        note_title: str,
        note_content: str,
        note_images: List[str],
        note_index: int = 0
    ) -> Dict[str, Any]:
        """Two-layer evaluation of a single note in one LLM call (full-prompt version).

        Layer 1: query-relevance filtering — discard results unrelated to the
        search query. Layer 2: target-feature matching score (0-10 scale in
        this prompt; note the sibling batch path uses a 0-1 scale).

        Args:
            search_query: Search query, e.g. "外观装扮 发布萌宠内容".
            target_feature: Target feature being traced, e.g. "佩戴".
            note_title: Note title.
            note_content: Note body text (truncated to 800 chars in the prompt).
            note_images: Image URL list (passed to the LLM for visual
                analysis and OCR).
            note_index: Index of the note; echoed back in the result.

        Returns:
            The LLM's parsed JSON dict (with 'note_index' added), or a
            fallback dict marked "评估失败" when the LLM call fails.
        """
        # Build the full evaluation prompt (user-supplied wording, kept verbatim —
        # do not edit this string).
        prompt = f"""# 任务说明
你需要判断搜索到的案例信息与目标特征的相关性。判断分为两层:第一层过滤与搜索Query无关的结果,第二层评估与目标特征的匹配度。

# 输入信息

搜索Query:{search_query}
目标特征:{target_feature}
搜索结果:
- 标题: {note_title}
- 正文: {note_content[:800]}
- 图像: {len(note_images)}张图片(请仔细分析图片内容,包括OCR提取图片中的文字)

# 判断流程
第一层:Query相关性过滤
判断标准:搜索结果是否与搜索Query相关
过滤规则:

✅ 保留:搜索结果的标题、正文或图像内容中包含Query相关的信息

Query的核心关键词在结果中出现
或结果讨论的主题与Query直接相关
或结果是Query概念的上位/下位/平行概念


❌ 过滤:搜索结果与Query完全无关

Query的关键词完全未出现
结果主题与Query无任何关联
仅因搜索引擎误匹配而出现



示例:

Query "墨镜搭配" → 结果"太阳镜选购指南" ✅ 保留(墨镜=太阳镜)
Query "墨镜搭配" → 结果"眼镜搭配技巧" ✅ 保留(眼镜是上位概念)
Query "墨镜搭配" → 结果"帽子搭配技巧" ❌ 过滤(完全无关)
Query "复古滤镜" → 结果"滤镜调色教程" ✅ 保留(包含滤镜)
Query "复古滤镜" → 结果"相机推荐" ❌ 过滤(主题不相关)

输出:
如果判定为 ❌ 过滤,直接输出:

json{{
    "Query相关性": "不相关",
    "综合得分": 0,
    "匹配类型": "过滤",
    "说明": "搜索结果与Query '{search_query}' 完全无关,建议过滤"
}}

如果判定为 ✅ 保留,进入第二层评分

第二层:目标特征匹配度评分
综合考虑语义相似度(概念匹配、层级关系、实操价值)和场景关联度(应用场景、使用语境)进行评分:
8-10分:完全匹配

语义层面:找到与目标特征完全相同或高度一致的内容,核心概念完全一致
场景层面:完全适用于同一场景、受众、平台和语境
实操价值:提供了具体可执行的方法、步骤或技巧
示例:

目标"复古滤镜" + 小红书穿搭场景 vs 结果"小红书复古滤镜调色教程"
目标"墨镜" + 时尚搭配场景 vs 结果"时尚墨镜搭配指南"



6-7分:相似匹配

语义层面:

结果是目标的上位概念(更宽泛)或下位概念(更具体)
或属于同一概念的不同表现形式
或属于平行概念(同级不同类)


场景层面:场景相近但有差异,需要筛选或调整后可用
实操价值:有一定参考价值但需要转化应用
示例:

目标"墨镜" + 时尚搭配 vs 结果"眼镜搭配技巧"(上位概念,需筛选)
目标"怀旧滤镜" + 人像拍摄 vs 结果"胶片感调色"(不同表现形式)
目标"日常穿搭" + 街拍 vs 结果"通勤穿搭拍照"(场景相近)



5-6分:弱相似

语义层面:属于同一大类但具体方向或侧重点明显不同
场景层面:场景有明显差异,迁移需要较大改造
实操价值:提供了概念启发但需要较大转化
示例:

目标"户外运动穿搭" vs 结果"健身房穿搭指南"
目标"小红书图文笔记" vs 结果"抖音短视频脚本"



4分及以下:无匹配

语义层面:仅表面词汇重叠,实质关联弱,或概念距离过远
场景层面:应用场景基本不同或完全不同
实操价值:实操指导价值有限或无价值
示例:

目标"墨镜" vs 结果"配饰大全"(概念过于宽泛)
目标"美食摄影构图" vs 结果"美食博主日常vlog"



概念层级关系说明
在评分时,需要注意概念层级关系的影响:
完全匹配(同一概念 + 同场景)→ 8-10分
目标"墨镜" vs 结果"墨镜搭配",且都在时尚搭配场景


上位/下位概念(层级差一层)→ 通常6-7分
目标"墨镜" vs 结果"眼镜搭配"(结果更宽泛,需筛选)
目标"眼镜" vs 结果"墨镜选购"(结果更具体,部分适用)


平行概念(同级不同类)→ 通常6-7分
目标"墨镜" vs 结果"近视眼镜"(都是眼镜类,但功能场景不同)


远距离概念(层级差两层及以上)→ 4分及以下
目标"墨镜" vs 结果"配饰"(概念过于宽泛,指导性弱)




匹配结论判断
根据综合得分判定匹配类型:

8.0-10.0分:✅ 完全匹配

判断:找到了目标特征的直接灵感来源
置信度:高
建议:直接采纳为该特征的灵感溯源结果


5.0-7.9分:⚠️ 相似匹配

判断:找到了相关的灵感参考,但存在一定差异
置信度:中
建议:作为候选结果保留,可与其他结果综合判断或继续搜索更精确的匹配


1.0-4.9分:❌ 无匹配

判断:该结果与目标特征关联度不足
置信度:低
建议:排除该结果,需要调整搜索策略继续寻找




# 输出格式
通过Query相关性过滤的结果:
json{{
    "Query相关性": "相关",
    "综合得分": 7.0,
    "匹配类型": "相似匹配",
    "置信度": "中",
    "评分说明": "结果'眼镜搭配技巧'是目标'墨镜'的上位概念,内容涵盖多种眼镜类型。场景都是时尚搭配,但需要从结果中筛选出墨镜相关的内容。概念关系:上位概念(宽泛一层)",
    "关键匹配点": [
        "眼镜与脸型的搭配原则(部分适用于墨镜)",
        "配饰的风格选择方法"
    ]
}}
未通过Query相关性过滤的结果:
json{{
    "Query相关性": "不相关",
    "综合得分": 0,
    "匹配类型": "过滤",
    "说明": "搜索结果'帽子搭配技巧'与Query'墨镜搭配'完全无关,建议过滤"
}}

# 特殊情况处理

复合特征评估:如果目标特征是复合型(如"复古滤镜+第一人称视角"),需要分别评估每个子特征的匹配度,然后取算术平均值作为最终得分
信息不完整:如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行评分
上位概念的实用性:当结果是目标的上位概念时,评分应考虑:

内容中目标相关部分的占比
是否提供了可直接应用于目标的知识
场景的一致性程度
如果结果虽是上位概念但完全不涉及目标内容,应降至5-6分或更低


Query与目标特征的关系:
如果Query就是目标特征本身,第一层和第二层判断可以合并考虑
如果Query是为了探索目标特征而构建的更宽泛查询,第一层更宽松,第二层更严格



只返回JSON,不要其他内容。"""

        # Call the LLM; image URLs are passed through for multimodal analysis.
        result = self.client.chat_json(
            prompt=prompt,
            images=note_images if note_images else None,  # pass images for vision/OCR
            max_retries=3
        )

        if result:
            # Tag the parsed result with the note's index for the caller.
            result['note_index'] = note_index
            return result
        else:
            logger.error(f" 评估笔记 {note_index} 失败: Query={search_query}")
            # Fallback record so callers always get a uniformly shaped dict.
            return {
                "note_index": note_index,
                "Query相关性": "评估失败",
                "综合得分": 0,
                "匹配类型": "评估失败",
                "说明": "LLM评估失败"
            }
|
|
|
|
|
+
|
|
|
|
|
    def batch_evaluate_notes_with_filter(
        self,
        search_query: str,
        target_feature: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 20,
        max_workers: int = 10
    ) -> Dict[str, Any]:
        """Two-layer evaluation of multiple notes, split into two LLM phases.

        Layer 1: batch query-relevance evaluation (1 LLM call via
        ``evaluate_query_relevance_batch``). Layer 2: for each note judged
        "相关", evaluate target-feature matching in parallel (M LLM calls via
        ``evaluate_feature_matching_single``, 0-1 score scale).

        Args:
            search_query: Search query string.
            target_feature: Target feature to trace.
            notes: Note dicts; each is expected to hold a 'note_card' mapping
                with 'display_title', 'desc' and 'image_list' keys —
                assumed schema, confirm against the search client.
            max_notes: Evaluate at most this many notes (prefix of ``notes``).
            max_workers: Maximum thread-pool concurrency for layer 2.

        Returns:
            Summary dict: total_notes, evaluated_notes, filtered_count,
            statistics (score-bucket distribution) and notes_evaluation
            (per-note merged results sorted by note_index).
        """
        # Empty input: return an empty but fully shaped summary.
        if not notes:
            return {
                "total_notes": 0,
                "evaluated_notes": 0,
                "filtered_count": 0,
                "statistics": {},
                "notes_evaluation": []
            }

        notes_to_eval = notes[:max_notes]
        logger.info(f" 两层评估 {len(notes_to_eval)} 个笔记")

        # ========== Layer 1: batch query-relevance evaluation ==========
        logger.info(f" [第一层] 批量评估Query相关性(1次LLM调用)")
        query_relevance_result = self.evaluate_query_relevance_batch(
            search_query=search_query,
            notes=notes_to_eval,
            max_notes=max_notes
        )

        # Parse layer-1 output and collect the notes marked "相关" (relevant).
        relevant_notes_info = []
        for idx, note in enumerate(notes_to_eval):
            note_key = f"note_{idx}"
            relevance_info = query_relevance_result.get(note_key, {})
            relevance = relevance_info.get("与query相关性", "相关")  # missing entries default to relevant

            if relevance == "相关":
                # Keep the note payload needed by the layer-2 evaluation.
                note_card = note.get('note_card', {})
                relevant_notes_info.append({
                    "note_index": idx,
                    "note_card": note_card,
                    "title": note_card.get('display_title', ''),
                    "content": note_card.get('desc', ''),
                    "images": note_card.get('image_list', []),
                    "第一层评估": relevance_info
                })

        logger.info(f" [第一层] 过滤结果: {len(relevant_notes_info)}/{len(notes_to_eval)} 条相关")

        # ========== Layer 2: feature matching for the relevant notes ==========
        evaluated_notes = []

        if relevant_notes_info:
            logger.info(f" [第二层] 并行评估特征匹配度({len(relevant_notes_info)}次LLM调用,{max_workers}并发)")

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []
                for note_info in relevant_notes_info:
                    future = executor.submit(
                        self.evaluate_feature_matching_single,
                        target_feature,
                        note_info["title"],
                        note_info["content"],
                        note_info["images"],
                        note_info["note_index"]
                    )
                    futures.append((future, note_info))

                # Collect results in submission order and merge both layers.
                for future, note_info in futures:
                    try:
                        second_layer_result = future.result()

                        # Flatten the layer-2 fields to the top level while
                        # also retaining both raw layer results.
                        merged_result = {
                            "note_index": note_info["note_index"],
                            "Query相关性": "相关",
                            "综合得分": second_layer_result.get("综合得分", 0.0),  # 0-1 scale
                            "匹配类型": second_layer_result.get("匹配类型", ""),
                            "评分说明": second_layer_result.get("评分说明", ""),
                            "关键匹配点": second_layer_result.get("关键匹配点", []),
                            "第一层评估": note_info["第一层评估"],
                            "第二层评估": second_layer_result
                        }
                        evaluated_notes.append(merged_result)
                    except Exception as e:
                        logger.error(f" [第二层] 评估笔记 {note_info['note_index']} 失败: {e}")
                        # Failed notes are still included, with a zero score.
                        evaluated_notes.append({
                            "note_index": note_info["note_index"],
                            "Query相关性": "相关",
                            "综合得分": 0.0,
                            "匹配类型": "评估失败",
                            "评分说明": f"第二层评估失败: {str(e)}",
                            "关键匹配点": [],
                            "第一层评估": note_info["第一层评估"],
                            "第二层评估": {}
                        })

        # Append the notes that layer 1 filtered out (query-irrelevant).
        for idx, note in enumerate(notes_to_eval):
            note_key = f"note_{idx}"
            relevance_info = query_relevance_result.get(note_key, {})
            relevance = relevance_info.get("与query相关性", "相关")

            if relevance == "不相关":
                evaluated_notes.append({
                    "note_index": idx,
                    "Query相关性": "不相关",
                    "综合得分": 0.0,
                    "匹配类型": "过滤",
                    "说明": relevance_info.get("说明", ""),
                    "第一层评估": relevance_info
                })

        # Restore original note order (layer-2 and filtered records interleave).
        evaluated_notes.sort(key=lambda x: x.get('note_index', 0))

        # Summary statistics.
        total_notes = len(notes)
        evaluated_count = len(evaluated_notes)
        filtered_count = sum(1 for n in evaluated_notes if n.get('Query相关性') == '不相关')

        # Score-bucket distribution (thresholds on the 0-1 scale).
        # NOTE(review): scores in (0.4, 0.5) land in the '无匹配(≤0.4)' bucket —
        # the label understates the bucket's upper bound; confirm intent.
        match_distribution = {
            '完全匹配(0.8-1.0)': 0,
            '相似匹配(0.6-0.79)': 0,
            '弱相似(0.5-0.59)': 0,
            '无匹配(≤0.4)': 0
        }

        for note_eval in evaluated_notes:
            if note_eval.get('Query相关性') == '不相关':
                continue  # filtered notes are excluded from the distribution

            score = note_eval.get('综合得分', 0)
            if score >= 0.8:
                match_distribution['完全匹配(0.8-1.0)'] += 1
            elif score >= 0.6:
                match_distribution['相似匹配(0.6-0.79)'] += 1
            elif score >= 0.5:
                match_distribution['弱相似(0.5-0.59)'] += 1
            else:
                match_distribution['无匹配(≤0.4)'] += 1

        logger.info(f" 评估完成: 过滤{filtered_count}条, 匹配分布: {match_distribution}")

        return {
            "total_notes": total_notes,
            "evaluated_notes": evaluated_count,
            "filtered_count": filtered_count,
            "statistics": match_distribution,
            "notes_evaluation": evaluated_notes
        }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def test_evaluator():
    """Manual smoke test for LLMEvaluator (needs live OpenRouter credentials).

    Exercises single search-word evaluation and the batch variant, printing
    scores to stdout. Not an automated test — it performs real LLM calls.
    """
    # Fix: removed unused `import os` (nothing in this function referenced it).

    # Build the client/evaluator pair; OpenRouterClient reads its own config.
    client = OpenRouterClient()
    evaluator = LLMEvaluator(client)

    # Single search-word evaluation.
    print("\n=== 测试搜索词评估 ===")
    result = evaluator.evaluate_search_word(
        original_feature="拟人",
        search_word="宠物猫 猫咪"
    )
    print(f"评分: {result['score']:.3f}")
    print(f"理由: {result['reasoning']}")

    # Batch evaluation with limited concurrency.
    print("\n=== 测试批量评估 ===")
    results = evaluator.evaluate_search_words_batch(
        original_feature="拟人",
        search_words=["宠物猫 猫咪", "宠物猫 猫孩子", "宠物猫 猫"],
        max_workers=2
    )
    for r in results:
        print(f"{r['search_word']}: {r['score']:.3f} (rank={r['rank']})")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Configure the root logger before launching the manual test harness.
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    test_evaluator()
|