|
@@ -0,0 +1,1577 @@
|
|
|
|
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LLM evaluation module.

Scores search-word quality and search-result relevance via an LLM
(OpenRouter) for the Xiaohongshu content pipeline.
"""

import logging
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.clients.openrouter_client import OpenRouterClient

logger = logging.getLogger(__name__)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class LLMEvaluator:
    """LLM-based evaluator for search words and search results."""
|
|
|
|
|
+
|
|
|
|
|
    def __init__(self, openrouter_client: OpenRouterClient):
        """
        Initialize the evaluator.

        Args:
            openrouter_client: OpenRouter client instance used for all LLM calls.
        """
        self.client = openrouter_client
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_search_word(
        self,
        original_feature: str,
        search_word: str
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of a combined search word (stage 4).

        Asks the LLM to judge how well `search_word` (built from a candidate
        vocabulary) can retrieve Xiaohongshu content related to
        `original_feature` without mentioning the feature verbatim.

        Args:
            original_feature: Original feature name the query should surface.
            search_word: Candidate vocabulary / combined search word to score.

        Returns:
            Dict with "score" (float; 0.0 on failure), "reasoning" (str) and
            "original_feature" (echoed for downstream bookkeeping).
        """
        # NOTE(review): the prompt's task text asks the model to extract and
        # combine query words, while the output schema only requests a
        # score + reasoning — confirm the prompt matches the intended output.
        prompt = f"""你是一个小红书内容分析专家。

# 任务说明
从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")

## 可选词汇
{search_word}

## 要求
1. 只能使用可选词汇中的词,可以进行以下变化:
   - 直接使用原词或括号内的同义词
   - 多个词组合
   - 适当精简
2. 不能添加可选词汇以外的新词
3. 按推荐程度排序(越靠前越推荐)

## 输出格式(JSON)
{{
    "score": 0.75,
    "reasoning": "评估理由"
}}

注意:只返回JSON,不要其他内容。"""

        result = self.client.chat_json(prompt=prompt, max_retries=3)

        if result:
            return {
                "score": result.get("score", 0.0),
                "reasoning": result.get("reasoning", ""),
                "original_feature": original_feature
            }
        else:
            # Degrade to a zero score so callers can still rank/filter.
            logger.error(f"评估搜索词失败: {search_word}")
            return {
                "score": 0.0,
                "reasoning": "LLM评估失败",
                "original_feature": original_feature
            }
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_words_batch(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_words: List[str],
|
|
|
|
|
+ max_workers: int = 5
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 批量评估搜索词(并行)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_words: 搜索词列表
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果列表(已排序)
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始批量评估 {len(search_words)} 个搜索词...")
|
|
|
|
|
+
|
|
|
|
|
+ results = []
|
|
|
|
|
+
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ # 提交任务
|
|
|
|
|
+ future_to_word = {
|
|
|
|
|
+ executor.submit(self.evaluate_search_word, original_feature, word): word
|
|
|
|
|
+ for word in search_words
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for idx, future in enumerate(as_completed(future_to_word), 1):
|
|
|
|
|
+ word = future_to_word[future]
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = future.result()
|
|
|
|
|
+ result["search_word"] = word
|
|
|
|
|
+ results.append(result)
|
|
|
|
|
+ logger.info(f" [{idx}/{len(search_words)}] {word}: {result['score']:.3f}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估失败: {word}, 错误: {e}")
|
|
|
|
|
+ results.append({
|
|
|
|
|
+ "search_word": word,
|
|
|
|
|
+ "score": 0.0,
|
|
|
|
|
+ "reasoning": f"评估异常: {str(e)}",
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ # 按分数排序
|
|
|
|
|
+ results.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ # 添加排名
|
|
|
|
|
+ for rank, result in enumerate(results, 1):
|
|
|
|
|
+ result["rank"] = rank
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"批量评估完成,最高分: {results[0]['score']:.3f}")
|
|
|
|
|
+
|
|
|
|
|
+ return results
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_words_in_batches(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_words: List[str],
|
|
|
|
|
+ batch_size: int = 50,
|
|
|
|
|
+ base_word: str = ""
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 分批评估搜索词(每批N个,减少API调用)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_words: 搜索词列表
|
|
|
|
|
+ batch_size: 每批处理的搜索词数量,默认10
|
|
|
|
|
+ base_word: 中心词(如果提供,要求所有组合必须包含此词)
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果列表(已排序)
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始分批评估 {len(search_words)} 个搜索词(每批 {batch_size} 个)...")
|
|
|
|
|
+
|
|
|
|
|
+ all_results = []
|
|
|
|
|
+ total_batches = (len(search_words) + batch_size - 1) // batch_size
|
|
|
|
|
+
|
|
|
|
|
+ # 分批处理
|
|
|
|
|
+ for batch_idx in range(total_batches):
|
|
|
|
|
+ start_idx = batch_idx * batch_size
|
|
|
|
|
+ end_idx = min(start_idx + batch_size, len(search_words))
|
|
|
|
|
+ batch_words = search_words[start_idx:end_idx]
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f" 处理第 {batch_idx + 1}/{total_batches} 批({len(batch_words)} 个搜索词)")
|
|
|
|
|
+
|
|
|
|
|
+ # 从搜索词中提取所有独特的词作为可选词汇
|
|
|
|
|
+ available_words_set = set()
|
|
|
|
|
+ for word in batch_words:
|
|
|
|
|
+ # 分割搜索词,提取单个词
|
|
|
|
|
+ parts = word.split()
|
|
|
|
|
+ available_words_set.update(parts)
|
|
|
|
|
+
|
|
|
|
|
+ # 转换为列表并排序(保证稳定性)
|
|
|
|
|
+ available_words = sorted(list(available_words_set))
|
|
|
|
|
+
|
|
|
|
|
+ # 构建可选词汇字符串(逗号分隔)
|
|
|
|
|
+ available_words_str = "、".join(available_words)
|
|
|
|
|
+
|
|
|
|
|
+ # 构建 base_word 约束
|
|
|
|
|
+ base_word_constraint = ""
|
|
|
|
|
+ if base_word:
|
|
|
|
|
+ base_word_constraint = f"""
|
|
|
|
|
+## 中心词约束(重要)
|
|
|
|
|
+- 所有组合词都基于中心词: **{base_word}**
|
|
|
|
|
+- **禁止去掉中心词**,你只负责评分和排序
|
|
|
|
|
+- source_word 必须包含 "{base_word}"
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+ prompt = f"""
|
|
|
|
|
+
|
|
|
|
|
+# 任务说明
|
|
|
|
|
+模拟你是一个内容创作者,评估并排序这些基于中心词的搜索组合。
|
|
|
|
|
+{base_word_constraint}
|
|
|
|
|
+
|
|
|
|
|
+## 可选词汇
|
|
|
|
|
+{available_words_str}
|
|
|
|
|
+
|
|
|
|
|
+## 要求
|
|
|
|
|
+1. 只能使用可选词汇中的词,可以进行以下变化:
|
|
|
|
|
+ - 直接使用原词或括号内的同义词
|
|
|
|
|
+ - 多个词组合
|
|
|
|
|
+ - 适当精简
|
|
|
|
|
+2. **source_word 必须包含中心词 "{base_word}"**(如果提供了中心词)
|
|
|
|
|
+3. 不能添加可选词汇以外的新词
|
|
|
|
|
+4. 按推荐程度排序(越靠前越推荐),取top5
|
|
|
|
|
+
|
|
|
|
|
+## 输出格式(JSON):
|
|
|
|
|
+[
|
|
|
|
|
+ {{
|
|
|
|
|
+ "rank": 1,
|
|
|
|
|
+ "search_word": "组合的搜索词",
|
|
|
|
|
+ "source_word": "组合来源词,空格分割,组合来源词都是从available_words_str中选取的",
|
|
|
|
|
+ "score": 0.85,
|
|
|
|
|
+ "reasoning": "推荐理由"
|
|
|
|
|
+ }},
|
|
|
|
|
+ {{
|
|
|
|
|
+ "index": 2,
|
|
|
|
|
+ "search_word": "组合的搜索词",
|
|
|
|
|
+ "source_word": "组合来源词,空格分割,组合来源词都是从available_words_str中选取的",
|
|
|
|
|
+ "score": 0.80,
|
|
|
|
|
+ "reasoning": "推荐理由"
|
|
|
|
|
+ }}
|
|
|
|
|
+]
|
|
|
|
|
+- 只返回JSON数组,不要其他内容"""
|
|
|
|
|
+
|
|
|
|
|
+ # 调用LLM
|
|
|
|
|
+ result = self.client.chat_json(prompt=prompt, max_retries=3)
|
|
|
|
|
+
|
|
|
|
|
+ if result and isinstance(result, list):
|
|
|
|
|
+ # 处理结果 - 新格式直接包含search_word
|
|
|
|
|
+ for idx, item in enumerate(result):
|
|
|
|
|
+ search_word = item.get("search_word", "")
|
|
|
|
|
+ if search_word: # 确保有搜索词
|
|
|
|
|
+ all_results.append({
|
|
|
|
|
+ "search_word": search_word,
|
|
|
|
|
+ "source_word": item.get("source_word", ""),
|
|
|
|
|
+ "score": item.get("score", 0.0),
|
|
|
|
|
+ "reasoning": item.get("reasoning", ""),
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+ logger.info(f" [{start_idx + idx + 1}/{len(search_words)}] "
|
|
|
|
|
+ f"{search_word}: {item.get('score', 0.0):.3f}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 第 {batch_idx + 1} 批评估失败,跳过")
|
|
|
|
|
+ # 为失败的批次添加默认结果(使用原搜索词)
|
|
|
|
|
+ for word in batch_words:
|
|
|
|
|
+ all_results.append({
|
|
|
|
|
+ "search_word": word,
|
|
|
|
|
+ "score": 0.0,
|
|
|
|
|
+ "reasoning": "批量评估失败",
|
|
|
|
|
+ "original_feature": original_feature
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ # 按分数排序
|
|
|
|
|
+ all_results.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ # 添加排名
|
|
|
|
|
+ for rank, result in enumerate(all_results, 1):
|
|
|
|
|
+ result["rank"] = rank
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"分批评估完成,最高分: {all_results[0]['score']:.3f} (总API调用: {total_batches} 次)")
|
|
|
|
|
+
|
|
|
|
|
+ return all_results
|
|
|
|
|
+
|
|
|
|
|
    def generate_queries_from_candidates(
        self,
        original_feature: str,
        base_word: str,
        candidate_words: List[str],
        max_queries: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Ask the LLM to compose search queries from a center word plus a
        candidate-word list.

        Args:
            original_feature: Original feature name (used for final scoring
                only, never as query material).
            base_word: Center word anchoring the queries.
            candidate_words: Candidate words available for combination.
            max_queries: Maximum number of queries kept from the LLM output.

        Returns:
            Query dicts in the legacy format: "search_word", "source_word"
            (validated), "score", "reasoning", "rank", "original_feature".
            Empty list when the LLM output is malformed.
        """
        logger.info(f"LLM生成query(中心词: {base_word}, 候选词: {len(candidate_words)}个)")

        candidate_words_str = "、".join(candidate_words)

        # NOTE(review): the prompt contains a stray lone "搜" character just
        # before "# 输入格式" — likely a typo; confirm before removing.
        prompt = f"""# 角色
你是一个专业的搜索query生成专家。你的任务是根据输入信息,生成最优的搜索query组合。

## 核心规则(必须严格遵守)
1. **绝对禁止**:
   - 目标动机严格隔离,仅用于最终匹配度评估,Query生成过程中不得使用目标动机原文
   - Query中不得包含动机词汇(如"如何"、"方法"、"技巧"、"教程"等意图词)
   - 所有分析基于真实信息,不可假设推导

2. **query构成**:仅由"中心词(如果有)+待选词"直接组成,无额外信息
3. **query结构**:2-4个词,考虑前后顺序,无相似或语义重叠的query
4. **输出数量**:生成1-4条备选query
搜
# 输入格式
目标特征:{original_feature}
中心词:{base_word}
待选词:{candidate_words_str}

注:带权重的词用括号标注权重值,无权重或权重为0则平权

# query生成流程

## 第一步:待选词预处理
**去重**
- 去除完全重复的词,优先保留权重高的的词

## 第二步:待选词关联性分析
**如果有中心词:**
分析每个待选词与中心词的语义关联强度,判断哪些词与中心词组合能形成有意义的搜索语义

**如果无中心词:**
分析待选词之间的语义关联强度,判断哪些词组合能形成完整的搜索语义场

**关联性分级:**
- **强关联(0.7-1.0)**:两词在语义上紧密配合,常在同一场景共现,组合后形成完整概念
- **中关联(0.4-0.69)**:两词有明确关联但不强制共现,组合后有一定语义增益
- **无关联(0.0-0.39)**:两词无明显语义关联,组合无意义

## 第三步:互补性分析

对关联度较高的词进行互补性判断:

**互补性分级:**
- **强互补**:两词描述不同维度,组合后语义更完整(如:主体+场景、形式+内容)
- **弱互补**:两词有差异但语义部分重叠
- **语义重叠**:两词描述同一维度,组合无新增价值(避免)

**常见互补维度组合:**
- 主体+场景
- 形式+内容
- 内容+应用方式
- 载体+场景+情绪

## 第四步:优先级排序

**综合排序考量:**
- 与中心词(或其他待选词)的关联强度
- 原始权重高低
- 互补性强弱

**排序原则:**
强关联+高权重+强互补 > 强关联+无权重+强互补 > 中关联+高权重

## 第五步:组合生成query

**组合策略:**

**如果有中心词:**
1. 中心词 + 强关联且强互补的待选词(1-2个)
2. 中心词 + 强关联但弱互补的待选词(1-2个)
3. 仅用待选词组合(当纯待选词组合语义更完整时)

**如果无中心词:**
1. 2-3个强关联且强互补的待选词组合
2. 1个核心词 + 1-2个中关联但强互补词

**组合规则:**
- 同一语义维度只保留1个最优词
- 优先选择互补性强的词组合
- 构成词数控制在2-3个
- 考虑词的前后顺序(词定语在前,核心名词在后;场景词在前,实体词在后)
**组合理由:**
说明为什么选择这些词组合,词与词之间如何协同工作,形成什么样的搜索语义场

## 第六步:query与目标动机匹配度评估
**重要说明:** 只有在query生成完成后,才将query与目标动机进行匹配度评估
**匹配分含义:**
匹配分 = 此query能找到目标动机所需内容的概率(0-1之间)

**评分标准:**
- **0.8-1.0分**:query在语意上与目标强关联,能精准召回目标动机所需内容,覆盖核心要素
- **0.4-0.79分**:query语意部分覆盖目标特征,能召回相关内容但可能不够精准,部分覆盖目标
- **0.39分以下**:query召回内容可能偏离目标动机

**评分维度:**
- query的语义场是否覆盖目标动机的核心要素
- query能否精准定位到目标所需的内容类型
- query在搜索引擎中的可召回性

**组合推理要求:**
用流畅的段落说明:
- query形成了什么样的搜索语义场
- 这个语义场如何与目标动机产生关联
- 为什么这个query能/不能召回目标所需内容
- 使用因果关联词(因为/由于/所以/因此)串联逻辑
- 避免"该query"、"这个"等模糊指代

# 输出格式
最终按照以下json格式输出
{{
    "queries": [
        {{
            "query": "查询词",
            "中心词": "{base_word}",
            "组合理由": "query词组合理由的详细说明,深度解释该query与目标及中心词的逻辑关联。目标特征的核心诉求是什么,基于这个诉求,选择了哪些词,为什么这些词最相关(说明权重、语义覆盖等原因)这些词如何协同工作,形成什么样的搜索语义场,词与词之间有什么语义延展关系,这个query预期能召回什么类型的内容,为什么能找到目标",
            "与目标匹配分": 0.85,
            "source_word ": "来源词,待选词和中心词组合"
        }}
    ]
}}


**关键点:**
1. query生成阶段:只考虑词与词之间的语义关联和互补性
2. 匹配评估阶段:才将生成的query与目标动机进行匹配度分析
3. 目标动机不参与query生成,仅用于最终评估

注意:只返回JSON,不要其他内容。"""

        # Call the LLM.
        llm_results = self.client.chat_json(prompt=prompt, max_retries=3)

        # Adapt to the new output format: {"queries": [...]}.
        if not llm_results or not isinstance(llm_results, dict):
            logger.error("LLM返回格式错误:期待dict格式")
            return []

        queries_list = llm_results.get("queries", [])
        if not isinstance(queries_list, list):
            logger.error("LLM返回格式错误:queries字段不是列表")
            return []

        logger.info(f"LLM生成了 {len(queries_list)} 个query")

        # Parse and validate each query item.
        formatted_results = []
        for rank, item in enumerate(queries_list[:max_queries], 1):
            # Field mapping from LLM output:
            #   "query" -> search_word
            #   "source_word " (note trailing space) -> source_word
            #   "组合理由" -> reasoning
            #   "与目标匹配分" -> score
            query_text = item.get("query", "")
            # Try the trailing-space key first; the prompt schema literally uses it.
            source_word_raw = item.get("source_word ", item.get("source_word", ""))

            # Restrict source_word to tokens from the allowed vocabulary.
            validated_source_word = self._validate_and_fix_source_word(
                llm_source_word=source_word_raw,
                query=query_text,
                base_word=base_word,
                candidate_words=candidate_words
            )

            formatted_results.append({
                "search_word": query_text,
                "source_word": validated_source_word,
                "score": item.get("与目标匹配分", 0.0),  # score reported by the LLM
                "reasoning": item.get("组合理由", ""),
                "rank": rank,
                "original_feature": original_feature
            })

        return formatted_results
|
|
|
|
|
+
|
|
|
|
|
+ def _validate_and_fix_source_word(
|
|
|
|
|
+ self,
|
|
|
|
|
+ llm_source_word: str,
|
|
|
|
|
+ query: str,
|
|
|
|
|
+ base_word: str,
|
|
|
|
|
+ candidate_words: List[str]
|
|
|
|
|
+ ) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 验证并修正 LLM 输出的 source_word
|
|
|
|
|
+ 确保只包含"中心词 + 候选词"中的词
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ llm_source_word: LLM 输出的 source_word
|
|
|
|
|
+ query: 生成的 search_word
|
|
|
|
|
+ base_word: 中心词
|
|
|
|
|
+ candidate_words: 候选词列表
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 验证后的 source_word
|
|
|
|
|
+ """
|
|
|
|
|
+ words = llm_source_word.split()
|
|
|
|
|
+ valid_words = []
|
|
|
|
|
+
|
|
|
|
|
+ # 验证每个词是否在允许列表中
|
|
|
|
|
+ for word in words:
|
|
|
|
|
+ if word == base_word or word in candidate_words:
|
|
|
|
|
+ valid_words.append(word)
|
|
|
|
|
+
|
|
|
|
|
+ # 确保中心词存在(如果query中包含)
|
|
|
|
|
+ if base_word in query and base_word not in valid_words:
|
|
|
|
|
+ valid_words.insert(0, base_word)
|
|
|
|
|
+
|
|
|
|
|
+ # 去重
|
|
|
|
|
+ seen = set()
|
|
|
|
|
+ deduplicated = []
|
|
|
|
|
+ for word in valid_words:
|
|
|
|
|
+ if word not in seen:
|
|
|
|
|
+ seen.add(word)
|
|
|
|
|
+ deduplicated.append(word)
|
|
|
|
|
+
|
|
|
|
|
+ return ' '.join(deduplicated)
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_single_note(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_word: str,
|
|
|
|
|
+ note: Dict[str, Any],
|
|
|
|
|
+ note_index: int = 0
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 评估单个帖子(阶段6,多模态)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_word: 搜索词
|
|
|
|
|
+ note: 单个帖子
|
|
|
|
|
+ note_index: 帖子索引
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 单个帖子的评估结果
|
|
|
|
|
+ """
|
|
|
|
|
+ card = note.get("note_card", {})
|
|
|
|
|
+ title = card.get("display_title", "")
|
|
|
|
|
+ desc = card.get("desc", "")[:500] # 限制长度
|
|
|
|
|
+ images = card.get("image_list", [])[:10] # 最多10张图
|
|
|
|
|
+
|
|
|
|
|
+ prompt = f"""你是一个小红书内容分析专家。
|
|
|
|
|
+
|
|
|
|
|
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
|
|
|
|
|
+
|
|
|
|
|
+原始特征:"{original_feature}"
|
|
|
|
|
+搜索词:"{search_word}"
|
|
|
|
|
+
|
|
|
|
|
+帖子内容:
|
|
|
|
|
+标题: {title}
|
|
|
|
|
+正文: {desc}
|
|
|
|
|
+
|
|
|
|
|
+请分析帖子的文字和图片内容,返回JSON格式:
|
|
|
|
|
+{{
|
|
|
|
|
+ "relevance": 0.85, // 0.0-1.0,相关度
|
|
|
|
|
+ "matched_elements": ["元素1", "元素2"], // 匹配的元素列表
|
|
|
|
|
+ "reasoning": "简短的匹配理由"
|
|
|
|
|
+}}
|
|
|
|
|
+
|
|
|
|
|
+只返回JSON,不要其他内容。"""
|
|
|
|
|
+
|
|
|
|
|
+ result = self.client.chat_json(
|
|
|
|
|
+ prompt=prompt,
|
|
|
|
|
+ images=images if images else None,
|
|
|
|
|
+ max_retries=3
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if result:
|
|
|
|
|
+ return {
|
|
|
|
|
+ "note_index": note_index,
|
|
|
|
|
+ "relevance": result.get("relevance", 0.0),
|
|
|
|
|
+ "matched_elements": result.get("matched_elements", []),
|
|
|
|
|
+ "reasoning": result.get("reasoning", "")
|
|
|
|
|
+ }
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 评估帖子 {note_index} 失败: {search_word}")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "note_index": note_index,
|
|
|
|
|
+ "relevance": 0.0,
|
|
|
|
|
+ "matched_elements": [],
|
|
|
|
|
+ "reasoning": "评估失败"
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_search_results_parallel(
|
|
|
|
|
+ self,
|
|
|
|
|
+ original_feature: str,
|
|
|
|
|
+ search_word: str,
|
|
|
|
|
+ notes: List[Dict[str, Any]],
|
|
|
|
|
+ max_notes: int = 20,
|
|
|
|
|
+ max_workers: int = 20
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 并行评估搜索结果(每个帖子独立评估)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ original_feature: 原始特征
|
|
|
|
|
+ search_word: 搜索词
|
|
|
|
|
+ notes: 帖子列表
|
|
|
|
|
+ max_notes: 最多评估几条帖子
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 评估结果汇总
|
|
|
|
|
+ """
|
|
|
|
|
+ if not notes:
|
|
|
|
|
+ return {
|
|
|
|
|
+ "overall_relevance": 0.0,
|
|
|
|
|
+ "extracted_elements": [],
|
|
|
|
|
+ "evaluated_notes": []
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ notes_to_eval = notes[:max_notes]
|
|
|
|
|
+ evaluated_notes = []
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f" 并行评估 {len(notes_to_eval)} 个帖子({max_workers}并发)")
|
|
|
|
|
+
|
|
|
|
|
+ # 20并发评估每个帖子
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ futures = []
|
|
|
|
|
+ for idx, note in enumerate(notes_to_eval):
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self.evaluate_single_note,
|
|
|
|
|
+ original_feature,
|
|
|
|
|
+ search_word,
|
|
|
|
|
+ note,
|
|
|
|
|
+ idx
|
|
|
|
|
+ )
|
|
|
|
|
+ futures.append(future)
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for future in as_completed(futures):
|
|
|
|
|
+ try:
|
|
|
|
|
+ result = future.result()
|
|
|
|
|
+ evaluated_notes.append(result)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估帖子失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 按note_index排序
|
|
|
|
|
+ evaluated_notes.sort(key=lambda x: x['note_index'])
|
|
|
|
|
+
|
|
|
|
|
+ # 汇总:计算整体相关度和提取元素
|
|
|
|
|
+ if evaluated_notes:
|
|
|
|
|
+ overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
|
|
|
|
|
+
|
|
|
|
|
+ # 提取所有元素并统计频次
|
|
|
|
|
+ element_counts = {}
|
|
|
|
|
+ for note in evaluated_notes:
|
|
|
|
|
+ for elem in note['matched_elements']:
|
|
|
|
|
+ element_counts[elem] = element_counts.get(elem, 0) + 1
|
|
|
|
|
+
|
|
|
|
|
+ # 按频次排序,取前5个
|
|
|
|
|
+ extracted_elements = sorted(
|
|
|
|
|
+ element_counts.keys(),
|
|
|
|
|
+ key=lambda x: element_counts[x],
|
|
|
|
|
+ reverse=True
|
|
|
|
|
+ )[:5]
|
|
|
|
|
+ else:
|
|
|
|
|
+ overall_relevance = 0.0
|
|
|
|
|
+ extracted_elements = []
|
|
|
|
|
+
|
|
|
|
|
+ return {
|
|
|
|
|
+ "overall_relevance": overall_relevance,
|
|
|
|
|
+ "extracted_elements": extracted_elements,
|
|
|
|
|
+ "evaluated_notes": evaluated_notes
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_search_results(
        self,
        original_feature: str,
        search_word: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 5,
        max_images_per_note: int = 10
    ) -> Dict[str, Any]:
        """
        Evaluate search results in one multimodal LLM call (stage 6).

        Args:
            original_feature: Target feature to look for.
            search_word: Search word that produced the notes.
            notes: Note (post) list.
            max_notes: Maximum number of notes evaluated.
            max_images_per_note: Maximum images taken from each note.

        Returns:
            Dict with "overall_relevance", "extracted_elements",
            "recommended_extension" and per-note "evaluated_notes";
            zeroed/empty defaults when there are no notes or the LLM fails.
        """
        if not notes:
            return {
                "overall_relevance": 0.0,
                "extracted_elements": [],
                "recommended_extension": None,
                "evaluated_notes": []
            }

        # Cap the number of notes sent to the LLM.
        notes_to_eval = notes[:max_notes]

        # Gather text snippets and images for the prompt.
        notes_info = []
        all_images = []

        for idx, note in enumerate(notes_to_eval):
            card = note.get("note_card", {})
            title = card.get("display_title", "")
            desc = card.get("desc", "")[:300]  # cap body length

            notes_info.append({
                "index": idx,
                "title": title,
                "desc": desc
            })

            # Collect up to max_images_per_note images per note.
            images = card.get("image_list", [])[:max_images_per_note]
            all_images.extend(images)

        # Render the per-note text section of the prompt.
        notes_text = "\n\n".join([
            f"帖子 {n['index']}:\n标题: {n['title']}\n正文: {n['desc']}"
            for n in notes_info
        ])

        prompt = f"""你是一个小红书内容分析专家。

任务:评估搜索结果是否包含目标特征的元素

原始特征:"{original_feature}"
搜索词:"{search_word}"
帖子数量:{len(notes_to_eval)} 条

帖子内容:
{notes_text}

请综合分析帖子的文字和图片内容,判断:
1. 这些搜索结果中是否包含与"{original_feature}"相似的元素
2. 提取最相关的元素关键词(2-4个字的词组)
3. 推荐最适合用于扩展搜索的关键词

返回JSON格式:
{{
    "overall_relevance": 0.72, // 0.0-1.0,整体相关度
    "extracted_elements": ["关键词1", "关键词2", "关键词3"], // 提取的相似元素,按相关度排序
    "recommended_extension": "关键词1", // 最优的扩展关键词
    "evaluated_notes": [
        {{
            "note_index": 0, // 帖子索引
            "relevance": 0.85, // 该帖子的相关度
            "matched_elements": ["元素1", "元素2"], // 该帖子匹配的元素
            "reasoning": "简短的匹配理由"
        }}
    ]
}}

注意:
- extracted_elements 应该是帖子中实际包含的、与原始特征相似的元素
- 优先提取在图片或文字中明显出现的元素
- 只返回JSON,不要其他内容"""

        # Multimodal LLM call (images attached when available).
        result = self.client.chat_json(
            prompt=prompt,
            images=all_images if all_images else None,
            max_retries=3
        )

        if result:
            # Normalize to the full expected shape.
            return {
                "overall_relevance": result.get("overall_relevance", 0.0),
                "extracted_elements": result.get("extracted_elements", []),
                "recommended_extension": result.get("recommended_extension"),
                "evaluated_notes": result.get("evaluated_notes", [])
            }
        else:
            logger.error(f"评估搜索结果失败: {search_word}")
            return {
                "overall_relevance": 0.0,
                "extracted_elements": [],
                "recommended_extension": None,
                "evaluated_notes": []
            }
|
|
|
|
|
+
|
|
|
|
|
+ def batch_evaluate_search_results(
|
|
|
|
|
+ self,
|
|
|
|
|
+ features_with_results: List[Dict[str, Any]],
|
|
|
|
|
+ max_workers: int = 3
|
|
|
|
|
+ ) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 批量评估搜索结果(并行,但并发数较低以避免超时)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ features_with_results: 带搜索结果的特征列表
|
|
|
|
|
+ max_workers: 最大并发数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 带评估结果的特征列表
|
|
|
|
|
+ """
|
|
|
|
|
+ logger.info(f"开始批量评估 {len(features_with_results)} 个搜索结果...")
|
|
|
|
|
+
|
|
|
|
|
+ results = []
|
|
|
|
|
+
|
|
|
|
|
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
+ # 提交任务
|
|
|
|
|
+ future_to_feature = {}
|
|
|
|
|
+ for feature in features_with_results:
|
|
|
|
|
+ if not feature.get("search_result"):
|
|
|
|
|
+ # 无搜索结果,跳过
|
|
|
|
|
+ feature["result_evaluation"] = None
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ original_feature = self._get_original_feature(feature)
|
|
|
|
|
+ search_word = feature.get("search_word", "")
|
|
|
|
|
+ notes = feature["search_result"].get("data", {}).get("data", [])
|
|
|
|
|
+
|
|
|
|
|
+ future = executor.submit(
|
|
|
|
|
+ self.evaluate_search_results,
|
|
|
|
|
+ original_feature,
|
|
|
|
|
+ search_word,
|
|
|
|
|
+ notes
|
|
|
|
|
+ )
|
|
|
|
|
+ future_to_feature[future] = feature
|
|
|
|
|
+
|
|
|
|
|
+ # 收集结果
|
|
|
|
|
+ for idx, future in enumerate(as_completed(future_to_feature), 1):
|
|
|
|
|
+ feature = future_to_feature[future]
|
|
|
|
|
+ try:
|
|
|
|
|
+ evaluation = future.result()
|
|
|
|
|
+ feature["result_evaluation"] = evaluation
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+ logger.info(f" [{idx}/{len(future_to_feature)}] {feature.get('search_word')}: "
|
|
|
|
|
+ f"relevance={evaluation['overall_relevance']:.3f}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" 评估失败: {feature.get('search_word')}, 错误: {e}")
|
|
|
|
|
+ feature["result_evaluation"] = None
|
|
|
|
|
+ results.append(feature)
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"批量评估完成")
|
|
|
|
|
+
|
|
|
|
|
+ return results
|
|
|
|
|
+
|
|
|
|
|
+ def _get_original_feature(self, feature_node: Dict[str, Any]) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 从特征节点中获取原始特征名称
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ feature_node: 特征节点
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 原始特征名称
|
|
|
|
|
+ """
|
|
|
|
|
+ # 尝试从llm_evaluation中获取
|
|
|
|
|
+ if "llm_evaluation" in feature_node:
|
|
|
|
|
+ return feature_node["llm_evaluation"].get("original_feature", "")
|
|
|
|
|
+
|
|
|
|
|
+ # 尝试从其他字段获取
|
|
|
|
|
+ return feature_node.get("原始特征名称", feature_node.get("特征名称", ""))
|
|
|
|
|
+
|
|
|
|
|
+    # ========== Stage 6: two-layer evaluation methods ==========
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_query_relevance_batch(
        self,
        search_query: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 20
    ) -> Dict[str, Any]:
        """
        Layer-1 evaluation: batch-judge whether search results are relevant
        to the query, in a single LLM call.

        Args:
            search_query: The search query.
            notes: Note (post) list.
            max_notes: Maximum number of notes evaluated.

        Returns:
            Mapping like:
            {
                "note_0": {"与query相关性": "相关", "说明": "..."},
                "note_1": {"与query相关性": "不相关", "说明": "..."},
                ...
            }
            Empty dict for empty input. On LLM failure, every note is
            conservatively marked "相关" (kept).
        """
        if not notes:
            return {}

        notes_to_eval = notes[:max_notes]

        # Build the note-list section of the prompt.
        notes_text = ""
        for idx, note in enumerate(notes_to_eval):
            note_card = note.get('note_card', {})
            title = note_card.get('display_title', '')
            content = note_card.get('desc', '')[:800]  # cap body length
            images = note_card.get('image_list', [])

            notes_text += f"note_{idx}:\n"
            notes_text += f"- 标题: {title}\n"
            notes_text += f"- 正文: {content}\n"
            notes_text += f"- 图像: {len(images)}张图片\n\n"

        # Full layer-1 evaluation prompt (supplied verbatim; do not simplify).
        prompt = f"""# 任务说明
判断搜索结果是否与搜索Query相关,过滤掉完全无关的结果。

# 输入信息

搜索Query: {search_query}

搜索结果列表:
{notes_text}

# 判断标准
✅ 相关(保留)
搜索结果的标题、正文或图像内容中包含Query相关的信息:

Query的核心关键词在结果中出现
或 结果讨论的主题与Query直接相关
或 结果是Query概念的上位/下位/平行概念


❌ 不相关(过滤)
搜索结果与Query完全无关:
Query的关键词完全未出现
结果主题与Query无任何关联
仅因搜索引擎误匹配而出现


## 判断示例
Query "墨镜搭配" → 结果"太阳镜选购指南" ✅ 保留(墨镜=太阳镜)
Query "墨镜搭配" → 结果"眼镜搭配技巧" ✅ 保留(眼镜是墨镜的上位概念)
Query "墨镜搭配" → 结果"帽子搭配技巧" ❌ 过滤(完全无关)
Query "复古滤镜" → 结果"滤镜调色教程" ✅ 保留(包含滤镜)
Query "复古滤镜" → 结果"相机推荐" ❌ 过滤(主题不相关)

# 输出格式
{{
    "note_0": {{
        "与query相关性": "相关 / 不相关",
        "说明": ""
    }},
    "note_1": {{
        "与query相关性": "相关 / 不相关",
        "说明": ""
    }}
}}

# 特殊情况处理

- 如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行判断
- 当无法明确判断时,倾向于保留(标记为"相关")

只返回JSON,不要其他内容。"""

        # Single LLM call covering the whole batch.
        result = self.client.chat_json(
            prompt=prompt,
            max_retries=3
        )

        if result:
            return result
        else:
            logger.error(f"  第一层批量评估失败: Query={search_query}")
            # Conservative fallback: keep every note ("相关") so nothing is lost.
            default_result = {}
            for idx in range(len(notes_to_eval)):
                default_result[f"note_{idx}"] = {
                    "与query相关性": "相关",
                    "说明": "LLM评估失败,默认保留"
                }
            return default_result
|
|
|
|
|
+
|
|
|
|
|
+ def evaluate_feature_matching_single(
|
|
|
|
|
+ self,
|
|
|
|
|
+ target_feature: str,
|
|
|
|
|
+ note_title: str,
|
|
|
|
|
+ note_content: str,
|
|
|
|
|
+ note_images: List[str],
|
|
|
|
|
+ note_index: int
|
|
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 第二层评估:评估单个笔记与目标特征的匹配度
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ target_feature: 目标特征
|
|
|
|
|
+ note_title: 笔记标题
|
|
|
|
|
+ note_content: 笔记正文
|
|
|
|
|
+ note_images: 图片URL列表
|
|
|
|
|
+ note_index: 笔记索引
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ {
|
|
|
|
|
+ "综合得分": 0.9, # 0-1分
|
|
|
|
|
+ "匹配类型": "完全匹配",
|
|
|
|
|
+ "评分说明": "...",
|
|
|
|
|
+ "关键匹配点": [...]
|
|
|
|
|
+ }
|
|
|
|
|
+ """
|
|
|
|
|
+ # 构建完整的第二层评估 Prompt(用户提供,不简化)
|
|
|
|
|
+ prompt = f"""# 任务说明
|
|
|
|
|
+你需要判断搜索到的案例与目标特征的相关性。
|
|
|
|
|
+
|
|
|
|
|
+# 输入信息
|
|
|
|
|
+目标特征:{target_feature}
|
|
|
|
|
+
|
|
|
|
|
+搜索结果:
|
|
|
|
|
+- 标题: {note_title}
|
|
|
|
|
+- 正文: {note_content[:800]}
|
|
|
|
|
+- 图像: {len(note_images)}张图片(请仔细分析图片内容,包括OCR提取图片中的文字)
|
|
|
|
|
+
|
|
|
|
|
+# 判断流程
|
|
|
|
|
+## 目标特征匹配度评分
|
|
|
|
|
+综合考虑语义相似度(概念匹配、层级关系)和场景关联度(应用场景、使用语境)进行评分:
|
|
|
|
|
+
|
|
|
|
|
+- 0.8-1分:完全匹配
|
|
|
|
|
+语义层面:找到与目标特征完全相同或高度一致的内容,核心概念完全一致
|
|
|
|
|
+场景层面:完全适用于同一场景、受众、平台和语境
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"复古滤镜" + 小红书穿搭场景 vs 结果"小红书复古滤镜调色教程"
|
|
|
|
|
+目标"墨镜" + 时尚搭配场景 vs 结果"时尚墨镜搭配指南"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.6-0.7分:相似匹配
|
|
|
|
|
+语义层面:
|
|
|
|
|
+结果是目标的上位概念(更宽泛)或下位概念(更具体)
|
|
|
|
|
+或属于同一概念的不同表现形式,或属于平行概念(同级不同类)
|
|
|
|
|
+场景层面:场景相近但有差异,需要筛选或调整后可用
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"墨镜" + 时尚搭配 vs 结果"眼镜搭配技巧"(上位概念,需筛选)
|
|
|
|
|
+目标"怀旧滤镜" + 人像拍摄 vs 结果"胶片感调色"(不同表现形式)
|
|
|
|
|
+目标"日常穿搭" + 街拍 vs 结果"通勤穿搭拍照"(场景相近)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.5-0.6分:弱相似
|
|
|
|
|
+语义层面:属于同一大类但具体方向或侧重点明显不同,仅提供了相关概念
|
|
|
|
|
+场景层面:场景有明显差异,迁移需要较大改造
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"户外运动穿搭" vs 结果"健身房穿搭指南"
|
|
|
|
|
+目标"小红书图文笔记" vs 结果"抖音短视频脚本"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+- 0.4分及以下:无匹配
|
|
|
|
|
+语义层面:仅表面词汇重叠,实质关联弱,或概念距离过远
|
|
|
|
|
+场景层面:应用场景基本不同或完全不同
|
|
|
|
|
+
|
|
|
|
|
+示例:
|
|
|
|
|
+目标"墨镜" vs 结果"配饰大全"(概念过于宽泛)
|
|
|
|
|
+目标"美食摄影构图" vs 结果"美食博主日常vlog"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+## 概念层级关系说明
|
|
|
|
|
+在评分时,需要注意概念层级关系的影响:
|
|
|
|
|
+完全匹配(同一概念 + 同场景)→ 0.8-1分
|
|
|
|
|
+目标"墨镜" vs 结果"墨镜搭配",且都在时尚搭配场景
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+上位/下位概念(层级差一层)→ 通常0.6-0.7分
|
|
|
|
|
+目标"墨镜" vs 结果"眼镜搭配"(结果更宽泛,需筛选)
|
|
|
|
|
+目标"眼镜" vs 结果"墨镜选购"(结果更具体,部分适用)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+平行概念(同级不同类)→ 通常0.6-0.7分
|
|
|
|
|
+目标"墨镜" vs 结果"近视眼镜"(都是眼镜类,但功能场景不同)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+远距离概念(层级差两层及以上)→ 0.5分及以下
|
|
|
|
|
+目标"墨镜" vs 结果"配饰"(概念过于宽泛,指导性弱)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# 匹配结论判断
|
|
|
|
|
+根据综合得分判定匹配类型:
|
|
|
|
|
+
|
|
|
|
|
+0.8-1.0分:✅ 完全匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:找到了目标特征的直接灵感来源
|
|
|
|
|
+建议:直接采纳为该特征的灵感溯源结果
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+0.6-0.79分:⚠️ 相似匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:找到了相关的灵感参考,但存在一定差异
|
|
|
|
|
+建议:作为候选结果保留,可与其他结果综合判断或继续搜索更精确的匹配
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+0.59分及以下:❌ 无匹配
|
|
|
|
|
+
|
|
|
|
|
+判断:该结果与目标特征关联度不足
|
|
|
|
|
+建议:排除该结果,需要调整搜索策略继续寻找
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# 输出格式
|
|
|
|
|
+{{
|
|
|
|
|
+ "综合得分": 0.7,
|
|
|
|
|
+ "匹配类型": "相似匹配",
|
|
|
|
|
+ "评分说明": "结果'眼镜搭配技巧'是目标'墨镜'的上位概念,内容涵盖多种眼镜类型。场景都是时尚搭配,但需要从结果中筛选出墨镜相关的内容。概念关系:上位概念(宽泛一层)",
|
|
|
|
|
+ "关键匹配点": [
|
|
|
|
|
+ "眼镜与脸型的搭配原则(部分适用于墨镜)",
|
|
|
|
|
+ "配饰的风格选择方法"
|
|
|
|
|
+ ]
|
|
|
|
|
+}}
|
|
|
|
|
+
|
|
|
|
|
+# 特殊情况处理
|
|
|
|
|
+复合特征评估:如果目标特征是复合型(如"复古滤镜+第一人称视角"),需要分别评估每个子特征的匹配度,然后取平均值作为最终得分
|
|
|
|
|
+信息不完整:如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行评分
|
|
|
|
|
+上位概念的实用性:当结果是目标的上位概念时,评分应考虑:内容中目标相关部分的占比;是否提供了可直接应用于目标的知识;场景的一致性程度;如果结果虽是上位概念但完全不涉及目标内容,应降至5-6分或更低
|
|
|
|
|
+
|
|
|
|
|
+只返回JSON,不要其他内容。"""
|
|
|
|
|
+
|
|
|
|
|
+ # 调用 LLM(传递图片进行多模态分析)
|
|
|
|
|
+ result = self.client.chat_json(
|
|
|
|
|
+ prompt=prompt,
|
|
|
|
|
+ images=note_images if note_images else None,
|
|
|
|
|
+ max_retries=3
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if result:
|
|
|
|
|
+ return result
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.error(f" 第二层评估失败: note {note_index}, target={target_feature}")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "综合得分": 0.0,
|
|
|
|
|
+ "匹配类型": "评估失败",
|
|
|
|
|
+ "评分说明": "LLM评估失败",
|
|
|
|
|
+ "关键匹配点": []
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
    def evaluate_note_with_filter(
        self,
        search_query: str,
        target_feature: str,
        note_title: str,
        note_content: str,
        note_images: List[str],
        note_index: int = 0
    ) -> Dict[str, Any]:
        """Two-layer evaluation of a single note in one LLM call (full-prompt version).

        Layer 1: query-relevance filtering — discard results unrelated to the
        search query. Layer 2: target-feature matching score (0-10 scale in
        this prompt; note the sibling batch path uses a 0-1 scale).

        Args:
            search_query: Search query, e.g. "外观装扮 发布萌宠内容".
            target_feature: Target feature being traced, e.g. "佩戴".
            note_title: Note title.
            note_content: Note body text (truncated to 800 chars in the prompt).
            note_images: Image URL list (passed to the LLM for visual
                analysis and OCR).
            note_index: Index of the note; echoed back in the result.

        Returns:
            The LLM's parsed JSON dict (with 'note_index' added), or a
            fallback dict marked "评估失败" when the LLM call fails.
        """
        # Build the full evaluation prompt (user-supplied wording, kept verbatim —
        # do not edit this string).
        prompt = f"""# 任务说明
你需要判断搜索到的案例信息与目标特征的相关性。判断分为两层:第一层过滤与搜索Query无关的结果,第二层评估与目标特征的匹配度。

# 输入信息

搜索Query:{search_query}
目标特征:{target_feature}
搜索结果:
- 标题: {note_title}
- 正文: {note_content[:800]}
- 图像: {len(note_images)}张图片(请仔细分析图片内容,包括OCR提取图片中的文字)

# 判断流程
第一层:Query相关性过滤
判断标准:搜索结果是否与搜索Query相关
过滤规则:

✅ 保留:搜索结果的标题、正文或图像内容中包含Query相关的信息

Query的核心关键词在结果中出现
或结果讨论的主题与Query直接相关
或结果是Query概念的上位/下位/平行概念


❌ 过滤:搜索结果与Query完全无关

Query的关键词完全未出现
结果主题与Query无任何关联
仅因搜索引擎误匹配而出现



示例:

Query "墨镜搭配" → 结果"太阳镜选购指南" ✅ 保留(墨镜=太阳镜)
Query "墨镜搭配" → 结果"眼镜搭配技巧" ✅ 保留(眼镜是上位概念)
Query "墨镜搭配" → 结果"帽子搭配技巧" ❌ 过滤(完全无关)
Query "复古滤镜" → 结果"滤镜调色教程" ✅ 保留(包含滤镜)
Query "复古滤镜" → 结果"相机推荐" ❌ 过滤(主题不相关)

输出:
如果判定为 ❌ 过滤,直接输出:

json{{
    "Query相关性": "不相关",
    "综合得分": 0,
    "匹配类型": "过滤",
    "说明": "搜索结果与Query '{search_query}' 完全无关,建议过滤"
}}

如果判定为 ✅ 保留,进入第二层评分

第二层:目标特征匹配度评分
综合考虑语义相似度(概念匹配、层级关系、实操价值)和场景关联度(应用场景、使用语境)进行评分:
8-10分:完全匹配

语义层面:找到与目标特征完全相同或高度一致的内容,核心概念完全一致
场景层面:完全适用于同一场景、受众、平台和语境
实操价值:提供了具体可执行的方法、步骤或技巧
示例:

目标"复古滤镜" + 小红书穿搭场景 vs 结果"小红书复古滤镜调色教程"
目标"墨镜" + 时尚搭配场景 vs 结果"时尚墨镜搭配指南"



6-7分:相似匹配

语义层面:

结果是目标的上位概念(更宽泛)或下位概念(更具体)
或属于同一概念的不同表现形式
或属于平行概念(同级不同类)


场景层面:场景相近但有差异,需要筛选或调整后可用
实操价值:有一定参考价值但需要转化应用
示例:

目标"墨镜" + 时尚搭配 vs 结果"眼镜搭配技巧"(上位概念,需筛选)
目标"怀旧滤镜" + 人像拍摄 vs 结果"胶片感调色"(不同表现形式)
目标"日常穿搭" + 街拍 vs 结果"通勤穿搭拍照"(场景相近)



5-6分:弱相似

语义层面:属于同一大类但具体方向或侧重点明显不同
场景层面:场景有明显差异,迁移需要较大改造
实操价值:提供了概念启发但需要较大转化
示例:

目标"户外运动穿搭" vs 结果"健身房穿搭指南"
目标"小红书图文笔记" vs 结果"抖音短视频脚本"



4分及以下:无匹配

语义层面:仅表面词汇重叠,实质关联弱,或概念距离过远
场景层面:应用场景基本不同或完全不同
实操价值:实操指导价值有限或无价值
示例:

目标"墨镜" vs 结果"配饰大全"(概念过于宽泛)
目标"美食摄影构图" vs 结果"美食博主日常vlog"



概念层级关系说明
在评分时,需要注意概念层级关系的影响:
完全匹配(同一概念 + 同场景)→ 8-10分
目标"墨镜" vs 结果"墨镜搭配",且都在时尚搭配场景


上位/下位概念(层级差一层)→ 通常6-7分
目标"墨镜" vs 结果"眼镜搭配"(结果更宽泛,需筛选)
目标"眼镜" vs 结果"墨镜选购"(结果更具体,部分适用)


平行概念(同级不同类)→ 通常6-7分
目标"墨镜" vs 结果"近视眼镜"(都是眼镜类,但功能场景不同)


远距离概念(层级差两层及以上)→ 4分及以下
目标"墨镜" vs 结果"配饰"(概念过于宽泛,指导性弱)




匹配结论判断
根据综合得分判定匹配类型:

8.0-10.0分:✅ 完全匹配

判断:找到了目标特征的直接灵感来源
置信度:高
建议:直接采纳为该特征的灵感溯源结果


5.0-7.9分:⚠️ 相似匹配

判断:找到了相关的灵感参考,但存在一定差异
置信度:中
建议:作为候选结果保留,可与其他结果综合判断或继续搜索更精确的匹配


1.0-4.9分:❌ 无匹配

判断:该结果与目标特征关联度不足
置信度:低
建议:排除该结果,需要调整搜索策略继续寻找




# 输出格式
通过Query相关性过滤的结果:
json{{
    "Query相关性": "相关",
    "综合得分": 7.0,
    "匹配类型": "相似匹配",
    "置信度": "中",
    "评分说明": "结果'眼镜搭配技巧'是目标'墨镜'的上位概念,内容涵盖多种眼镜类型。场景都是时尚搭配,但需要从结果中筛选出墨镜相关的内容。概念关系:上位概念(宽泛一层)",
    "关键匹配点": [
        "眼镜与脸型的搭配原则(部分适用于墨镜)",
        "配饰的风格选择方法"
    ]
}}
未通过Query相关性过滤的结果:
json{{
    "Query相关性": "不相关",
    "综合得分": 0,
    "匹配类型": "过滤",
    "说明": "搜索结果'帽子搭配技巧'与Query'墨镜搭配'完全无关,建议过滤"
}}

# 特殊情况处理

复合特征评估:如果目标特征是复合型(如"复古滤镜+第一人称视角"),需要分别评估每个子特征的匹配度,然后取算术平均值作为最终得分
信息不完整:如果OCR提取的图像文字不完整或正文内容缺失,应在说明中注明,并根据实际可获取的信息进行评分
上位概念的实用性:当结果是目标的上位概念时,评分应考虑:

内容中目标相关部分的占比
是否提供了可直接应用于目标的知识
场景的一致性程度
如果结果虽是上位概念但完全不涉及目标内容,应降至5-6分或更低


Query与目标特征的关系:
如果Query就是目标特征本身,第一层和第二层判断可以合并考虑
如果Query是为了探索目标特征而构建的更宽泛查询,第一层更宽松,第二层更严格



只返回JSON,不要其他内容。"""

        # Call the LLM; image URLs are passed through for multimodal analysis.
        result = self.client.chat_json(
            prompt=prompt,
            images=note_images if note_images else None,  # pass images for vision/OCR
            max_retries=3
        )

        if result:
            # Tag the parsed result with the note's index for the caller.
            result['note_index'] = note_index
            return result
        else:
            logger.error(f" 评估笔记 {note_index} 失败: Query={search_query}")
            # Fallback record so callers always get a uniformly shaped dict.
            return {
                "note_index": note_index,
                "Query相关性": "评估失败",
                "综合得分": 0,
                "匹配类型": "评估失败",
                "说明": "LLM评估失败"
            }
|
|
|
|
|
+
|
|
|
|
|
    def batch_evaluate_notes_with_filter(
        self,
        search_query: str,
        target_feature: str,
        notes: List[Dict[str, Any]],
        max_notes: int = 20,
        max_workers: int = 10
    ) -> Dict[str, Any]:
        """Two-layer evaluation of multiple notes, split into two LLM phases.

        Layer 1: batch query-relevance evaluation (1 LLM call via
        ``evaluate_query_relevance_batch``). Layer 2: for each note judged
        "相关", evaluate target-feature matching in parallel (M LLM calls via
        ``evaluate_feature_matching_single``, 0-1 score scale).

        Args:
            search_query: Search query string.
            target_feature: Target feature to trace.
            notes: Note dicts; each is expected to hold a 'note_card' mapping
                with 'display_title', 'desc' and 'image_list' keys —
                assumed schema, confirm against the search client.
            max_notes: Evaluate at most this many notes (prefix of ``notes``).
            max_workers: Maximum thread-pool concurrency for layer 2.

        Returns:
            Summary dict: total_notes, evaluated_notes, filtered_count,
            statistics (score-bucket distribution) and notes_evaluation
            (per-note merged results sorted by note_index).
        """
        # Empty input: return an empty but fully shaped summary.
        if not notes:
            return {
                "total_notes": 0,
                "evaluated_notes": 0,
                "filtered_count": 0,
                "statistics": {},
                "notes_evaluation": []
            }

        notes_to_eval = notes[:max_notes]
        logger.info(f" 两层评估 {len(notes_to_eval)} 个笔记")

        # ========== Layer 1: batch query-relevance evaluation ==========
        logger.info(f" [第一层] 批量评估Query相关性(1次LLM调用)")
        query_relevance_result = self.evaluate_query_relevance_batch(
            search_query=search_query,
            notes=notes_to_eval,
            max_notes=max_notes
        )

        # Parse layer-1 output and collect the notes marked "相关" (relevant).
        relevant_notes_info = []
        for idx, note in enumerate(notes_to_eval):
            note_key = f"note_{idx}"
            relevance_info = query_relevance_result.get(note_key, {})
            relevance = relevance_info.get("与query相关性", "相关")  # missing entries default to relevant

            if relevance == "相关":
                # Keep the note payload needed by the layer-2 evaluation.
                note_card = note.get('note_card', {})
                relevant_notes_info.append({
                    "note_index": idx,
                    "note_card": note_card,
                    "title": note_card.get('display_title', ''),
                    "content": note_card.get('desc', ''),
                    "images": note_card.get('image_list', []),
                    "第一层评估": relevance_info
                })

        logger.info(f" [第一层] 过滤结果: {len(relevant_notes_info)}/{len(notes_to_eval)} 条相关")

        # ========== Layer 2: feature matching for the relevant notes ==========
        evaluated_notes = []

        if relevant_notes_info:
            logger.info(f" [第二层] 并行评估特征匹配度({len(relevant_notes_info)}次LLM调用,{max_workers}并发)")

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []
                for note_info in relevant_notes_info:
                    future = executor.submit(
                        self.evaluate_feature_matching_single,
                        target_feature,
                        note_info["title"],
                        note_info["content"],
                        note_info["images"],
                        note_info["note_index"]
                    )
                    futures.append((future, note_info))

                # Collect results in submission order and merge both layers.
                for future, note_info in futures:
                    try:
                        second_layer_result = future.result()

                        # Flatten the layer-2 fields to the top level while
                        # also retaining both raw layer results.
                        merged_result = {
                            "note_index": note_info["note_index"],
                            "Query相关性": "相关",
                            "综合得分": second_layer_result.get("综合得分", 0.0),  # 0-1 scale
                            "匹配类型": second_layer_result.get("匹配类型", ""),
                            "评分说明": second_layer_result.get("评分说明", ""),
                            "关键匹配点": second_layer_result.get("关键匹配点", []),
                            "第一层评估": note_info["第一层评估"],
                            "第二层评估": second_layer_result
                        }
                        evaluated_notes.append(merged_result)
                    except Exception as e:
                        logger.error(f" [第二层] 评估笔记 {note_info['note_index']} 失败: {e}")
                        # Failed notes are still included, with a zero score.
                        evaluated_notes.append({
                            "note_index": note_info["note_index"],
                            "Query相关性": "相关",
                            "综合得分": 0.0,
                            "匹配类型": "评估失败",
                            "评分说明": f"第二层评估失败: {str(e)}",
                            "关键匹配点": [],
                            "第一层评估": note_info["第一层评估"],
                            "第二层评估": {}
                        })

        # Append the notes that layer 1 filtered out (query-irrelevant).
        for idx, note in enumerate(notes_to_eval):
            note_key = f"note_{idx}"
            relevance_info = query_relevance_result.get(note_key, {})
            relevance = relevance_info.get("与query相关性", "相关")

            if relevance == "不相关":
                evaluated_notes.append({
                    "note_index": idx,
                    "Query相关性": "不相关",
                    "综合得分": 0.0,
                    "匹配类型": "过滤",
                    "说明": relevance_info.get("说明", ""),
                    "第一层评估": relevance_info
                })

        # Restore original note order (layer-2 and filtered records interleave).
        evaluated_notes.sort(key=lambda x: x.get('note_index', 0))

        # Summary statistics.
        total_notes = len(notes)
        evaluated_count = len(evaluated_notes)
        filtered_count = sum(1 for n in evaluated_notes if n.get('Query相关性') == '不相关')

        # Score-bucket distribution (thresholds on the 0-1 scale).
        # NOTE(review): scores in (0.4, 0.5) land in the '无匹配(≤0.4)' bucket —
        # the label understates the bucket's upper bound; confirm intent.
        match_distribution = {
            '完全匹配(0.8-1.0)': 0,
            '相似匹配(0.6-0.79)': 0,
            '弱相似(0.5-0.59)': 0,
            '无匹配(≤0.4)': 0
        }

        for note_eval in evaluated_notes:
            if note_eval.get('Query相关性') == '不相关':
                continue  # filtered notes are excluded from the distribution

            score = note_eval.get('综合得分', 0)
            if score >= 0.8:
                match_distribution['完全匹配(0.8-1.0)'] += 1
            elif score >= 0.6:
                match_distribution['相似匹配(0.6-0.79)'] += 1
            elif score >= 0.5:
                match_distribution['弱相似(0.5-0.59)'] += 1
            else:
                match_distribution['无匹配(≤0.4)'] += 1

        logger.info(f" 评估完成: 过滤{filtered_count}条, 匹配分布: {match_distribution}")

        return {
            "total_notes": total_notes,
            "evaluated_notes": evaluated_count,
            "filtered_count": filtered_count,
            "statistics": match_distribution,
            "notes_evaluation": evaluated_notes
        }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def test_evaluator():
    """Manual smoke test for LLMEvaluator (needs live OpenRouter credentials).

    Exercises single search-word evaluation and the batch variant, printing
    scores to stdout. Not an automated test — it performs real LLM calls.
    """
    # Fix: removed unused `import os` (nothing in this function referenced it).

    # Build the client/evaluator pair; OpenRouterClient reads its own config.
    client = OpenRouterClient()
    evaluator = LLMEvaluator(client)

    # Single search-word evaluation.
    print("\n=== 测试搜索词评估 ===")
    result = evaluator.evaluate_search_word(
        original_feature="拟人",
        search_word="宠物猫 猫咪"
    )
    print(f"评分: {result['score']:.3f}")
    print(f"理由: {result['reasoning']}")

    # Batch evaluation with limited concurrency.
    print("\n=== 测试批量评估 ===")
    results = evaluator.evaluate_search_words_batch(
        original_feature="拟人",
        search_words=["宠物猫 猫咪", "宠物猫 猫孩子", "宠物猫 猫"],
        max_workers=2
    )
    for r in results:
        print(f"{r['search_word']}: {r['score']:.3f} (rank={r['rank']})")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Configure the root logger before launching the manual test harness.
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    test_evaluator()
|