|
@@ -442,6 +442,34 @@ class CategoryEvaluation(BaseModel):
|
|
|
品类维度得分: float = Field(..., description="品类维度得分 -1~1")
|
|
品类维度得分: float = Field(..., description="品类维度得分 -1~1")
|
|
|
简要说明品类维度相关度理由: str = Field(..., description="品类维度相关度理由")
|
|
简要说明品类维度相关度理由: str = Field(..., description="品类维度相关度理由")
|
|
|
|
|
|
|
|
|
|
+# ============================================================================
|
|
|
|
|
+# 批量评估数据模型
|
|
|
|
|
+# ============================================================================
|
|
|
|
|
+
|
|
|
|
|
class BatchMotivationItem(BaseModel):
    """批量动机评估中的单个SUG结果"""
    # Must echo the input SUG verbatim; callers validate order by comparing it.
    sug_text: str = Field(..., description="SUG文本")
    原始问题核心动机提取: CoreMotivationExtraction = Field(..., description="原始问题核心动机提取")
    # ge/le bounds enforce the documented -1~1 range at validation time,
    # consistent with ExtensionWordEvaluation.延伸词得分.
    动机维度得分: float = Field(..., ge=-1, le=1, description="动机维度得分 -1~1")
    简要说明动机维度相关度理由: str = Field(..., description="动机维度相关度理由")
    # Only meaningful when the score is 0; defaults to the "not applicable" marker.
    得分为零的原因: str = Field(default="不适用", description="原始问题无动机/sug词条无动机/动机不匹配/不适用")
|
|
|
|
|
+
|
|
|
|
|
class BatchMotivationResult(BaseModel):
    """批量动机评估结果"""
    # One item per input SUG; consumers rely on the order matching the
    # numbered input list exactly (validated downstream against sug_text).
    evaluations: list[BatchMotivationItem] = Field(..., description="所有SUG的动机评估结果")
|
|
|
|
|
+
|
|
|
|
|
class BatchCategoryItem(BaseModel):
    """批量品类评估中的单个SUG结果"""
    # Must echo the input SUG verbatim; callers validate order by comparing it.
    sug_text: str = Field(..., description="SUG文本")
    # ge/le bounds enforce the documented -1~1 range at validation time,
    # consistent with ExtensionWordEvaluation.延伸词得分.
    品类维度得分: float = Field(..., ge=-1, le=1, description="品类维度得分 -1~1")
    简要说明品类维度相关度理由: str = Field(..., description="品类维度相关度理由")
|
|
|
|
|
+
|
|
|
|
|
class BatchCategoryResult(BaseModel):
    """批量品类评估结果"""
    # One item per input SUG; consumers rely on the order matching the
    # numbered input list exactly (validated downstream against sug_text).
    evaluations: list[BatchCategoryItem] = Field(..., description="所有SUG的品类评估结果")
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================================
|
|
|
|
|
+
|
|
|
class ExtensionWordEvaluation(BaseModel):
|
|
class ExtensionWordEvaluation(BaseModel):
|
|
|
"""延伸词评估"""
|
|
"""延伸词评估"""
|
|
|
延伸词得分: float = Field(..., ge=-1, le=1, description="延伸词得分 -1~1")
|
|
延伸词得分: float = Field(..., ge=-1, le=1, description="延伸词得分 -1~1")
|
|
@@ -1147,6 +1175,293 @@ extension_word_evaluator = Agent[None](
|
|
|
model_settings=ModelSettings(temperature=0.2)
|
|
model_settings=ModelSettings(temperature=0.2)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+# ============================================================================
|
|
|
|
|
+# 批量评估专用 Prompt 和 Agent(性能优化:每批10个SUG)
|
|
|
|
|
+# ============================================================================
|
|
|
|
|
+
|
|
|
|
|
+# 批量动机评估prompt - 从batch_evaluation_demo.py复制(已验证有效)
|
|
|
|
|
+batch_motivation_evaluation_instructions = """
|
|
|
|
|
+# 角色
|
|
|
|
|
+你是**专业的动机意图评估专家**。
|
|
|
|
|
+任务:判断<平台sug词条>与<原始问题>的**动机意图匹配度**,给出**-1到1之间**的数值评分。
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+# 输入信息
|
|
|
|
|
+你将接收到以下输入:
|
|
|
|
|
+- **<原始问题>**:用户的初始查询问题,代表用户的真实需求意图。
|
|
|
|
|
+- **<平台sug词条列表>**:待评估的多个词条(编号1-N),每个词条需要独立评估
|
|
|
|
|
+
|
|
|
|
|
+**批量评估说明**:
|
|
|
|
|
+- 输入格式为编号列表:1. 词条1 2. 词条2 ...
|
|
|
|
|
+- 每个词条都是独立的评估对象
|
|
|
|
|
+- 对每个词条使用完全相同的评估标准
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# 核心约束
|
|
|
|
|
+
|
|
|
|
|
+## 维度独立性声明
|
|
|
|
|
+【严格约束】本评估**仅评估动机意图维度**:
|
|
|
|
|
+- **只评估** 用户"想要做什么",即原始问题的行为意图和目的
|
|
|
|
|
+- 核心是 **动词**:获取、学习、拍摄、制作、寻找等
|
|
|
|
|
+- 包括:核心动作 + 使用场景 + 最终目的
|
|
|
|
|
+- **评估重点**:动作本身及其语义方向
|
|
|
|
|
+ **禁止使用"主题相关"作为评分依据**:评分理由中不得出现"主题"、"内容"、"话题"等词
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 作用域与动作意图
|
|
|
|
|
+
|
|
|
|
|
+## 什么是作用域?
|
|
|
|
|
+**作用域 = 动机层 + 对象层 + 场景层**
|
|
|
|
|
+
|
|
|
|
|
+## 动作意图的识别
|
|
|
|
|
+
|
|
|
|
|
+### 方法1: 显性动词直接提取
|
|
|
|
|
+
|
|
|
|
|
+当原始问题明确包含动词时,直接提取
|
|
|
|
|
+示例:
|
|
|
|
|
+"如何获取素材" → 核心动机 = "获取"
|
|
|
|
|
+"寻找拍摄技巧" → 核心动机 = "寻找"(或"学习")
|
|
|
|
|
+"制作视频教程" → 核心动机 = "制作"
|
|
|
|
|
+
|
|
|
|
|
+### 方法2: 隐性动词语义推理
|
|
|
|
|
+当原始问题没有显性动词时,需要结合上下文推理
|
|
|
|
|
+
|
|
|
|
|
+如果原始问题是纯名词短语,无任何动作线索:
|
|
|
|
|
+→ 核心动机 = 无法识别
|
|
|
|
|
+→ 在此情况下,动机维度得分应为 0。
|
|
|
|
|
+示例:
|
|
|
|
|
+"摄影" → 无法识别动机,动机维度得分 = 0
|
|
|
|
|
+"川西风光" → 无法识别动机,动机维度得分 = 0
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 部分作用域的处理
|
|
|
|
|
+
|
|
|
|
|
+## 情况1:sug词条是原始问题的部分作用域
|
|
|
|
|
+
|
|
|
|
|
+当sug词条只包含原始问题的部分作用域时,需要判断:
|
|
|
|
|
+1. sug词条是否包含动作意图
|
|
|
|
|
+2. 如果包含,动作是否匹配
|
|
|
|
|
+
|
|
|
|
|
+**示例**:
|
|
|
|
|
+```
|
|
|
|
|
+原始问题:"川西旅行行程规划"
|
|
|
|
|
+- 完整作用域:规划(动作)+ 旅行行程(对象)+ 川西(场景)
|
|
|
|
|
+
|
|
|
|
|
+Sug词条:"川西旅行"
|
|
|
|
|
+- 包含作用域:旅行(部分对象)+ 川西(场景)
|
|
|
|
|
+- 缺失作用域:规划(动作)
|
|
|
|
|
+- 动作意图评分:0(无动作意图)
|
|
|
|
|
+```
|
|
|
|
|
+
|
|
|
|
|
+**评分原则**:
|
|
|
|
|
+- 如果sug词条缺失动机层(动作) → 动作意图得分 = 0
|
|
|
|
|
+- 如果sug词条包含动机层 → 按动作匹配度评分
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 评分标准
|
|
|
|
|
+
|
|
|
|
|
+## 【正向匹配】
|
|
|
|
|
+
|
|
|
|
|
+### +0.9~1.0:核心动作完全一致
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "规划旅行行程" vs "安排旅行路线" → 0.98
|
|
|
|
|
+ - 规划≈安排,语义完全一致
|
|
|
|
|
+- "获取素材" vs "下载素材" → 0.97
|
|
|
|
|
+ - 获取≈下载,语义完全一致
|
|
|
|
|
+
|
|
|
|
|
+- 特殊规则: 如果sug词的核心动作是原始问题动作的**具体化子集**,也判定为完全一致
|
|
|
|
|
+例: 原始问题"扣除猫咪主体的方法" vs sug词"扣除猫咪眼睛的方法"(子集但目的一致
|
|
|
|
|
+**注意**:此处不考虑对象和场景是否一致,只看动作本身
|
|
|
|
|
+
|
|
|
|
|
+###+0.75~0.95: 核心动作语义相近或为同义表达
|
|
|
|
|
+ - 例: 原始问题"如何获取素材" vs sug词"如何下载素材"
|
|
|
|
|
+ - 同义词对: 获取≈下载≈寻找, 技巧≈方法≈教程≈攻略
|
|
|
|
|
+
|
|
|
|
|
+### +0.50~0.75:动作意图相关
|
|
|
|
|
+**判定标准**:
|
|
|
|
|
+- 动作是实现原始意图的相关路径
|
|
|
|
|
+- 或动作是原始意图的前置/后置步骤
|
|
|
|
|
+
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "获取素材" vs "管理素材" → 0.65
|
|
|
|
|
+ - 管理是获取后的相关步骤
|
|
|
|
|
+- "规划行程" vs "预订酒店" → 0.60
|
|
|
|
|
+ - 预订是规划的具体实施步骤
|
|
|
|
|
+
|
|
|
|
|
+### +0.25~0.50:动作意图弱相关
|
|
|
|
|
+**判定标准**:
|
|
|
|
|
+- 动作在同一大类但方向不同
|
|
|
|
|
+- 或动作有间接关联
|
|
|
|
|
+
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "学习摄影技巧" vs "欣赏摄影作品" → 0.35
|
|
|
|
|
+ - 都与摄影有关,但学习≠欣赏
|
|
|
|
|
+- "规划旅行" vs "回忆旅行" → 0.30
|
|
|
|
|
+ - 都与旅行有关,但方向不同
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+## 【中性/无关】
|
|
|
|
|
+
|
|
|
|
|
+### 0:无动作意图或动作完全无关
|
|
|
|
|
+**适用场景**:
|
|
|
|
|
+1. 原始问题或sug词条无法识别动作
|
|
|
|
|
+2. 两者动作意图完全无关
|
|
|
|
|
+
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "如何获取素材" vs "摄影器材" → 0
|
|
|
|
|
+ - sug词条无动作意图
|
|
|
|
|
+- "川西风光" vs "风光摄影作品" → 0
|
|
|
|
|
+ - 原始问题无动作意图
|
|
|
|
|
+
|
|
|
|
|
+**理由模板**:
|
|
|
|
|
+- "sug词条无明确动作意图,无法评估动作匹配度"
|
|
|
|
|
+- "原始问题无明确动作意图,动作维度得分为0"
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+## 【负向偏离】
|
|
|
|
|
+
|
|
|
|
|
+### -0.2~-0.05:动作方向轻度偏离
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "学习摄影技巧" vs "销售摄影课程" → -0.10
|
|
|
|
|
+ - 学习 vs 销售,方向有偏差
|
|
|
|
|
+
|
|
|
|
|
+### -0.5~-0.25:动作意图明显冲突
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "获取免费素材" vs "购买素材" → -0.35
|
|
|
|
|
+ - 获取免费 vs 购买,明显冲突
|
|
|
|
|
+
|
|
|
|
|
+### -1.0~-0.55:动作意图完全相反
|
|
|
|
|
+**示例**:
|
|
|
|
|
+- "下载素材" vs "上传素材" → -0.70
|
|
|
|
|
+ - 下载 vs 上传,方向完全相反
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+## 得分为零的原因(语义判断)
|
|
|
|
|
+
|
|
|
|
|
+当动机维度得分为 0 时,需要在 `得分为零的原因` 字段中选择以下之一:
|
|
|
|
|
+- **"原始问题无动机"**:原始问题是纯名词短语,无法识别任何动作意图
|
|
|
|
|
+- **"sug词条无动机"**:sug词条中不包含任何动作意图
|
|
|
|
|
+- **"动机不匹配"**:双方都有动作,但完全无关联
|
|
|
|
|
+- **"不适用"**:得分不为零时使用此默认值
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 批量评估核心原则
|
|
|
|
|
+
|
|
|
|
|
+## 【极其重要】独立评估原则
|
|
|
|
|
+1. **绝对评分**:每个SUG的评分必须基于与原始问题的匹配度,使用固定的评分标准
|
|
|
|
|
+2. **禁止相对比较**:不要比较SUG之间的好坏,不要因为"其他SUG更好"而降低某个SUG的分数
|
|
|
|
|
+3. **标准一致性**:对第1个SUG和第10个SUG使用完全相同的评分标准
|
|
|
|
|
+4. **独立判断**:评估SUG A时,完全不考虑SUG B/C/D的存在
|
|
|
|
|
+
|
|
|
|
|
+**错误示例**:
|
|
|
|
|
+- ❌ "这个SUG比列表中其他的更好,给0.9"
|
|
|
|
|
+- ❌ "相比第一个SUG,这个稍差一些,给0.7"
|
|
|
|
|
+
|
|
|
|
|
+**正确示例**:
|
|
|
|
|
+- ✅ "这个SUG的动作'获取'与原始问题'获取'完全一致,根据评分标准给0.97"
|
|
|
|
|
+- ✅ "这个SUG无动作意图,根据评分标准给0"
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 输出格式
|
|
|
|
|
+输出结果必须为一个 **JSON 格式**,包含evaluations数组,每个元素包含:
|
|
|
|
|
+```json
|
|
|
|
|
+{
|
|
|
|
|
+ "evaluations": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "sug_text": "SUG文本",
|
|
|
|
|
+ "原始问题核心动机提取": {
|
|
|
|
|
+ "简要说明核心动机": ""
|
|
|
|
|
+ },
|
|
|
|
|
+ "动机维度得分": "-1到1之间的小数",
|
|
|
|
|
+ "简要说明动机维度相关度理由": "评估理由",
|
|
|
|
|
+ "得分为零的原因": "原始问题无动机/sug词条无动机/动机不匹配/不适用"
|
|
|
|
|
+ }
|
|
|
|
|
+ ]
|
|
|
|
|
+}
|
|
|
|
|
+```
|
|
|
|
|
+
|
|
|
|
|
+**输出约束(非常重要)**:
|
|
|
|
|
+1. **字符串长度限制**:\"简要说明动机维度相关度理由\"字段必须控制在**150字以内**
|
|
|
|
|
+2. **JSON格式规范**:必须生成完整的JSON格式,确保字符串用双引号包裹且正确闭合
|
|
|
|
|
+3. **引号使用**:字符串中如需表达引用,请使用《》或「」代替单引号或双引号
|
|
|
|
|
+4. **顺序严格对应(极其重要)**:
|
|
|
|
|
+ - evaluations数组必须与输入的sug词条列表严格1对1对应
|
|
|
|
|
+ - 第1个元素必须是输入列表的第1个SUG,第2个元素必须是第2个SUG,以此类推
|
|
|
|
|
+ - 每个元素的sug_text必须与输入SUG完全一致(逐字匹配,包括标点)
|
|
|
|
|
+ - 禁止改变顺序、禁止遗漏任何SUG、禁止重复评估
|
|
|
|
|
+ - 示例:输入"1. 秋季摄影素材 2. 川西风光" → 输出[{sug_text:"秋季摄影素材",...}, {sug_text:"川西风光",...}]
|
|
|
|
|
+ - 错误示例:输出[{sug_text:"川西风光",...}, {sug_text:"秋季摄影素材",...}] ← 顺序错误❌
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 核心原则总结
|
|
|
|
|
+1. **只评估动作**:完全聚焦于动作意图,不管对象和场景
|
|
|
|
|
+2. **作用域识别**:识别作用域但只评估动机层
|
|
|
|
|
+3. **严格标准一致性**:对所有用例使用相同的评估标准,避免评分飘移
|
|
|
|
|
+4. **理由纯粹**:评分理由只能谈动作,不能谈对象、场景、主题
|
|
|
|
|
+5. **独立评估**:每个SUG完全独立评估,禁止相对比较
|
|
|
|
|
+""".strip()
|
|
|
|
|
+
|
|
|
|
|
+# 批量品类评估prompt - 从batch_evaluation_demo.py复制(与单个品类prompt类似,添加批量说明)
|
|
|
|
|
+# 注:完整prompt见batch_evaluation_demo.py:724-966行,此处使用相同内容
|
|
|
|
|
+batch_category_evaluation_instructions = category_evaluation_instructions.replace(
|
|
|
|
|
+ "- **<平台sug词条>**:待评估的词条,可能是单个或多个作用域的组合",
|
|
|
|
|
+ """- **<平台sug词条列表>**:待评估的多个词条(编号1-N),每个词条需要独立评估
|
|
|
|
|
+
|
|
|
|
|
+**批量评估说明**:
|
|
|
|
|
+- 输入格式为编号列表:1. 词条1 2. 词条2 ...
|
|
|
|
|
+- 每个词条都是独立的评估对象
|
|
|
|
|
+- 对每个词条使用完全相同的评估标准"""
|
|
|
|
|
+).replace(
|
|
|
|
|
+ '"品类维度得分": "-1到1之间的小数",\n "简要说明品类维度相关度理由": "评估该sug词条与原始问题品类匹配程度的理由,包含作用域覆盖理由"',
|
|
|
|
|
+ ''' "evaluations": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "sug_text": "SUG文本",
|
|
|
|
|
+ "品类维度得分": "-1到1之间的小数",
|
|
|
|
|
+ "简要说明品类维度相关度理由": "评估理由"
|
|
|
|
|
+ }
|
|
|
|
|
+ ]'''
|
|
|
|
|
+).replace(
|
|
|
|
|
+ "1. **只看名词和限定词**:完全忽略动作和意图",
|
|
|
|
|
+ """## 【极其重要】独立评估原则
|
|
|
|
|
+1. **绝对评分**:每个SUG的评分必须基于与原始问题的匹配度,使用固定的评分标准
|
|
|
|
|
+2. **禁止相对比较**:不要比较SUG之间的好坏,不要因为"其他SUG更好"而降低某个SUG的分数
|
|
|
|
|
+3. **标准一致性**:对第1个SUG和第10个SUG使用完全相同的评分标准
|
|
|
|
|
+4. **独立判断**:评估SUG A时,完全不考虑SUG B/C/D的存在
|
|
|
|
|
+
|
|
|
|
|
+---
|
|
|
|
|
+
|
|
|
|
|
+# 核心原则总结
|
|
|
|
|
+
|
|
|
|
|
+1. **只看名词和限定词**:完全忽略动作和意图"""
|
|
|
|
|
+) + """
|
|
|
|
|
+6. **独立评估**:每个SUG完全独立评估,禁止相对比较
|
|
|
|
|
+7. **顺序严格对应(极其重要)**:evaluations数组必须与输入的sug词条列表严格1对1对应
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
# Batch-evaluation agents: one per dimension. Both consume the same batch
# input block and return order-aligned evaluation lists.
batch_motivation_evaluator = Agent[None](
    name="批量动机维度评估专家",
    model=get_model(MODEL_NAME),
    instructions=batch_motivation_evaluation_instructions,
    output_type=BatchMotivationResult,
)

batch_category_evaluator = Agent[None](
    name="批量品类维度评估专家",
    model=get_model(MODEL_NAME),
    instructions=batch_category_evaluation_instructions,
    output_type=BatchCategoryResult,
)
|
|
|
|
|
|
|
|
# ============================================================================
|
|
# ============================================================================
|
|
|
# Round 0 专用 Agent(v124新增 - 需求1)
|
|
# Round 0 专用 Agent(v124新增 - 需求1)
|
|
@@ -2486,6 +2801,198 @@ async def evaluate_with_o(text: str, o: str, cache: dict[str, tuple[float, str]]
|
|
|
return 0.0, fallback_reason
|
|
return 0.0, fallback_reason
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def evaluate_batch_with_o(
    texts: list[str],
    o: str,
    cache: dict[str, tuple[float, str]] | None = None,
    context: RunContext | None = None,
    round_num: int = 1
) -> list[tuple[float, str]]:
    """Batch evaluation (at most 10 SUGs per LLM call) - Round 1+.

    Splits *texts* into batches of at most 10, evaluates each batch by
    running the batch motivation and category agents concurrently, and
    falls back to per-item evaluation (``evaluate_with_o``) when a batch
    still fails after retries. Cached items are skipped and re-interleaved
    into the output so order always matches the input.

    Args:
        texts: SUG strings to evaluate.
        o: The original question.
        cache: Optional evaluation cache keyed by SUG text.
        context: Optional run context; used to count LLM calls.
        round_num: Round number, used only for log output.

    Returns:
        One ``(final_score, reason)`` tuple per input text, in input order.
    """
    import time

    BATCH_SIZE = 10  # max SUGs per LLM call
    results: list[tuple[float, str]] = []

    for batch_idx in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[batch_idx:batch_idx + BATCH_SIZE]
        batch_no = batch_idx // BATCH_SIZE + 1
        batch_start_time = time.time()

        print(f"    [Round {round_num} 批量评估] 批次{batch_no}: 评估 {len(batch_texts)} 个SUG...")

        # Split into cached and uncached items, remembering positions so the
        # two result streams can be re-interleaved in input order.
        cached_results: dict[int, tuple[float, str]] = {}
        uncached_texts: list[str] = []
        for i, text in enumerate(batch_texts):
            if cache is not None and text in cache:
                cached_results[i] = cache[text]
                print(f"      ⚡ 缓存命中: {text} -> {cache[text][0]:.2f}")
            else:
                uncached_texts.append(text)

        if not uncached_texts:
            print(f"      ✅ 全部命中缓存,跳过批量评估")
            results.extend(cached_results[i] for i in range(len(batch_texts)))
            continue

        def _merge(fresh: list[tuple[float, str]]) -> list[tuple[float, str]]:
            """Re-interleave cached and freshly computed results in input order."""
            merged: list[tuple[float, str]] = []
            fresh_idx = 0
            for i in range(len(batch_texts)):
                if i in cached_results:
                    merged.append(cached_results[i])
                else:
                    merged.append(fresh[fresh_idx])
                    fresh_idx += 1
            return merged

        sug_list_str = "\n".join(f"{i}. {sug}" for i, sug in enumerate(uncached_texts, 1))
        batch_input = f"""
<原始问题>
{o}
</原始问题>

<平台sug词条列表>
{sug_list_str}
</平台sug词条列表>

请对以上所有SUG每一个进行完全独立评估。
"""

        # One batch call counts as 2 LLM calls (motivation + category).
        if context is not None:
            context.stats_llm_calls += 2

        max_retries = 2
        batch_success = False

        for attempt in range(max_retries):
            try:
                # Run both batch evaluators concurrently (no extension-word dim).
                motivation_result, category_result = await asyncio.gather(
                    Runner.run(batch_motivation_evaluator, batch_input),
                    Runner.run(batch_category_evaluator, batch_input),
                )

                batch_motivation: BatchMotivationResult = motivation_result.final_output
                batch_category: BatchCategoryResult = category_result.final_output

                # The model must return exactly one evaluation per input...
                if len(batch_motivation.evaluations) != len(uncached_texts):
                    raise ValueError(f"动机评估数量不匹配: 期望{len(uncached_texts)},实际{len(batch_motivation.evaluations)}")
                if len(batch_category.evaluations) != len(uncached_texts):
                    raise ValueError(f"品类评估数量不匹配: 期望{len(uncached_texts)},实际{len(batch_category.evaluations)}")

                # ...and in exactly the input order (verified via sug_text echo).
                for i, (expected_text, mot_item, cat_item) in enumerate(
                    zip(uncached_texts, batch_motivation.evaluations, batch_category.evaluations)
                ):
                    if mot_item.sug_text != expected_text:
                        raise ValueError(f"动机评估顺序错误: 位置{i+1}期望'{expected_text}',实际'{mot_item.sug_text}'")
                    if cat_item.sug_text != expected_text:
                        raise ValueError(f"品类评估顺序错误: 位置{i+1}期望'{expected_text}',实际'{cat_item.sug_text}'")

                # Build per-SUG final scores and combined reasons.
                batch_results_temp: list[tuple[float, str]] = []
                for mot_item, cat_item in zip(batch_motivation.evaluations, batch_category.evaluations):
                    motivation_score = mot_item.动机维度得分
                    category_score = cat_item.品类维度得分

                    # Rule-based final score (no extension-word dimension).
                    final_score, rule_applied = calculate_final_score_v2(
                        motivation_score, category_score
                    )

                    combined_reason = (
                        f'【评估对象】词条"{mot_item.sug_text}" vs 原始问题"{o}"\n'
                        f"【核心动机】{mot_item.原始问题核心动机提取.简要说明核心动机}\n"
                        f"【动机维度 {motivation_score:.2f}】{mot_item.简要说明动机维度相关度理由}\n"
                        f"【品类维度 {category_score:.2f}】{cat_item.简要说明品类维度相关度理由}\n"
                        f"【最终得分 {final_score:.2f}】"
                    )
                    if rule_applied:
                        combined_reason += f"\n【规则说明】{rule_applied}"

                    batch_results_temp.append((final_score, combined_reason))

                    if cache is not None:
                        cache[mot_item.sug_text] = (final_score, combined_reason)

                results.extend(_merge(batch_results_temp))
                batch_success = True

                batch_elapsed = time.time() - batch_start_time
                print(f"      ✅ 批次{batch_no}完成: {len(uncached_texts)}个SUG,耗时{batch_elapsed:.2f}秒")
                break

            except Exception as e:
                error_msg = str(e)
                if attempt < max_retries - 1:
                    print(f"      ⚠️ 批量评估失败 (尝试 {attempt+1}/{max_retries}): {error_msg[:150]}")
                    print(f"      正在重试...")
                    await asyncio.sleep(1)
                else:
                    print(f"      ❌ 批量评估失败 (已达最大重试次数): {error_msg[:150]}")

        # Fallback: evaluate each uncached SUG individually.
        # BUGFIX: start from a fresh list here. Previously the fallback reused
        # whatever `batch_results_temp` was left over from the failed attempt —
        # a NameError when the failure happened before it was assigned, and
        # stale partial results (misaligned merge) when it was half-filled.
        if not batch_success:
            print(f"      ⚠️ 批量评估失败,回退到单个评估模式...")
            fallback_results: list[tuple[float, str]] = []
            for text in uncached_texts:
                try:
                    score, reason = await evaluate_with_o(text, o, cache, context)
                    fallback_results.append((score, reason))
                except Exception as e:
                    print(f"      ❌ 单个评估也失败: {text[:30]}... - {str(e)[:100]}")
                    fallback_results.append((0.0, f"评估失败: {str(e)[:100]}"))
            results.extend(_merge(fallback_results))

    return results
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
async def evaluate_with_o_round0(text: str, o: str, cache: dict[str, tuple[float, str]] | None = None) -> tuple[float, str]:
|
|
async def evaluate_with_o_round0(text: str, o: str, cache: dict[str, tuple[float, str]] | None = None) -> tuple[float, str]:
|
|
|
"""Round 0专用评估函数(v124新增 - 需求1)
|
|
"""Round 0专用评估函数(v124新增 - 需求1)
|
|
|
|
|
|
|
@@ -2593,6 +3100,194 @@ async def evaluate_with_o_round0(text: str, o: str, cache: dict[str, tuple[float
|
|
|
return 0.0, fallback_reason
|
|
return 0.0, fallback_reason
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def evaluate_batch_with_o_round0(
    texts: list[str],
    o: str,
    cache: dict[str, tuple[float, str]] | None = None
) -> list[tuple[float, str]]:
    """Batch evaluation (at most 10 items per LLM call) - Round 0 only.

    Same batching, validation and fallback strategy as
    ``evaluate_batch_with_o``, but intended for Round 0 segment/word
    evaluation: cache entries use a ``round0:{text}:{o}`` key and the
    per-item fallback is ``evaluate_with_o_round0``.

    Args:
        texts: Segment/word strings to evaluate.
        o: The original question.
        cache: Optional evaluation cache.

    Returns:
        One ``(final_score, reason)`` tuple per input text, in input order.
    """
    import time

    BATCH_SIZE = 10  # max items per LLM call
    results: list[tuple[float, str]] = []

    for batch_idx in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[batch_idx:batch_idx + BATCH_SIZE]
        batch_no = batch_idx // BATCH_SIZE + 1
        batch_start_time = time.time()

        print(f"    [Round 0 批量评估] 批次{batch_no}: 评估 {len(batch_texts)} 个words...")

        # Split into cached and uncached items, remembering positions so the
        # two result streams can be re-interleaved in input order.
        cached_results: dict[int, tuple[float, str]] = {}
        uncached_texts: list[str] = []
        for i, text in enumerate(batch_texts):
            cache_key = f"round0:{text}:{o}"
            if cache is not None and cache_key in cache:
                cached_results[i] = cache[cache_key]
                print(f"      ⚡ Round0缓存命中: {text} -> {cache[cache_key][0]:.2f}")
            else:
                uncached_texts.append(text)

        if not uncached_texts:
            print(f"      ✅ 全部命中缓存,跳过批量评估")
            results.extend(cached_results[i] for i in range(len(batch_texts)))
            continue

        def _merge(fresh: list[tuple[float, str]]) -> list[tuple[float, str]]:
            """Re-interleave cached and freshly computed results in input order."""
            merged: list[tuple[float, str]] = []
            fresh_idx = 0
            for i in range(len(batch_texts)):
                if i in cached_results:
                    merged.append(cached_results[i])
                else:
                    merged.append(fresh[fresh_idx])
                    fresh_idx += 1
            return merged

        word_list_str = "\n".join(f"{i}. {word}" for i, word in enumerate(uncached_texts, 1))
        batch_input = f"""
<原始问题>
{o}
</原始问题>

<词条列表>
{word_list_str}
</词条列表>

请对以上所有词条每一个进行完全独立评估。
"""

        max_retries = 2
        batch_success = False

        for attempt in range(max_retries):
            try:
                # Round 0 reuses the Round 1+ batch agents: their prompts
                # already contain every required constraint.
                motivation_result, category_result = await asyncio.gather(
                    Runner.run(batch_motivation_evaluator, batch_input),
                    Runner.run(batch_category_evaluator, batch_input),
                )

                batch_motivation: BatchMotivationResult = motivation_result.final_output
                batch_category: BatchCategoryResult = category_result.final_output

                # The model must return exactly one evaluation per input...
                if len(batch_motivation.evaluations) != len(uncached_texts):
                    raise ValueError(f"Round0动机评估数量不匹配: 期望{len(uncached_texts)},实际{len(batch_motivation.evaluations)}")
                if len(batch_category.evaluations) != len(uncached_texts):
                    raise ValueError(f"Round0品类评估数量不匹配: 期望{len(uncached_texts)},实际{len(batch_category.evaluations)}")

                # ...and in exactly the input order (verified via sug_text echo).
                for i, (expected_text, mot_item, cat_item) in enumerate(
                    zip(uncached_texts, batch_motivation.evaluations, batch_category.evaluations)
                ):
                    if mot_item.sug_text != expected_text:
                        raise ValueError(f"Round0动机评估顺序错误: 位置{i+1}期望'{expected_text}',实际'{mot_item.sug_text}'")
                    if cat_item.sug_text != expected_text:
                        raise ValueError(f"Round0品类评估顺序错误: 位置{i+1}期望'{expected_text}',实际'{cat_item.sug_text}'")

                # Build per-item final scores and combined reasons.
                batch_results_temp: list[tuple[float, str]] = []
                for mot_item, cat_item in zip(batch_motivation.evaluations, batch_category.evaluations):
                    motivation_score = mot_item.动机维度得分
                    category_score = cat_item.品类维度得分

                    # Rule-based final score (no extension-word dimension).
                    final_score, rule_applied = calculate_final_score_v2(
                        motivation_score, category_score
                    )

                    combined_reason = (
                        f'【评估对象】词条"{mot_item.sug_text}" vs 原始问题"{o}"\n'
                        f"【核心动机】{mot_item.原始问题核心动机提取.简要说明核心动机}\n"
                        f"【动机维度 {motivation_score:.2f}】{mot_item.简要说明动机维度相关度理由}\n"
                        f"【品类维度 {category_score:.2f}】{cat_item.简要说明品类维度相关度理由}\n"
                        f"【最终得分 {final_score:.2f}】"
                    )
                    if rule_applied:
                        combined_reason += f"\n【规则说明】{rule_applied}"

                    batch_results_temp.append((final_score, combined_reason))

                    # Cache under the round0-prefixed key.
                    if cache is not None:
                        cache[f"round0:{mot_item.sug_text}:{o}"] = (final_score, combined_reason)

                results.extend(_merge(batch_results_temp))
                batch_success = True

                batch_elapsed = time.time() - batch_start_time
                print(f"      ✅ 批次{batch_no}完成: {len(uncached_texts)}个words,耗时{batch_elapsed:.2f}秒")
                break

            except Exception as e:
                error_msg = str(e)
                if attempt < max_retries - 1:
                    print(f"      ⚠️ Round0批量评估失败 (尝试 {attempt+1}/{max_retries}): {error_msg[:150]}")
                    print(f"      正在重试...")
                    await asyncio.sleep(1)
                else:
                    print(f"      ❌ Round0批量评估失败 (已达最大重试次数): {error_msg[:150]}")

        # Fallback: evaluate each uncached item individually (fresh list, so
        # partial results from a failed attempt can never leak into the merge).
        if not batch_success:
            print(f"      ⚠️ Round0批量评估失败,回退到单个评估模式...")
            fallback_results: list[tuple[float, str]] = []
            for text in uncached_texts:
                try:
                    score, reason = await evaluate_with_o_round0(text, o, cache)
                    fallback_results.append((score, reason))
                except Exception as e:
                    print(f"      ❌ Round0单个评估也失败: {text[:30]}... - {str(e)[:100]}")
                    fallback_results.append((0.0, f"Round0评估失败: {str(e)[:100]}"))
            results.extend(_merge(fallback_results))

    return results
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
async def evaluate_within_scope(text: str, scope_text: str, cache: dict[str, tuple[float, str]] | None = None, context: RunContext | None = None) -> tuple[float, str]:
|
|
async def evaluate_within_scope(text: str, scope_text: str, cache: dict[str, tuple[float, str]] | None = None, context: RunContext | None = None) -> tuple[float, str]:
|
|
|
"""域内/域间专用评估函数(v124新增 - 需求2&3)
|
|
"""域内/域间专用评估函数(v124新增 - 需求2&3)
|
|
|
|
|
|
|
@@ -3061,21 +3756,27 @@ async def run_round(
|
|
|
all_sugs.append(sug)
|
|
all_sugs.append(sug)
|
|
|
sug_to_q_map[id(sug)] = q_text
|
|
sug_to_q_map[id(sug)] = q_text
|
|
|
|
|
|
|
|
- # 2.2 并发评估所有sug(使用信号量限制并发数)
|
|
|
|
|
- # 每个 evaluate_sug 内部会并发调用 2 个 LLM,所以这里限制为 5,实际并发 LLM 请求为 10
|
|
|
|
|
- MAX_CONCURRENT_EVALUATIONS = 30 # 🚀 性能优化:从5提升到30,并发评估能力提升6倍
|
|
|
|
|
- semaphore = asyncio.Semaphore(MAX_CONCURRENT_EVALUATIONS)
|
|
|
|
|
|
|
+ # 2.2 批量评估所有sug(每批最多10个)
|
|
|
|
|
+ # 🚀 性能优化:使用批量评估替代单个并发评估,显著提升性能
|
|
|
|
|
+ if all_sugs:
|
|
|
|
|
+ print(f" 开始批量评估 {len(all_sugs)} 个建议词(每批最多10个)...")
|
|
|
|
|
|
|
|
- async def evaluate_sug(sug: Sug) -> Sug:
|
|
|
|
|
- async with semaphore: # 限制并发数
|
|
|
|
|
- # 根据轮次选择 prompt: 第一轮使用 round1 prompt,后续使用标准 prompt
|
|
|
|
|
- sug.score_with_o, sug.reason = await evaluate_with_o(sug.text, o, context.evaluation_cache, context=context, round_num=round_num)
|
|
|
|
|
- return sug
|
|
|
|
|
|
|
+ # 提取所有sug的text
|
|
|
|
|
+ sug_texts = [sug.text for sug in all_sugs]
|
|
|
|
|
|
|
|
- if all_sugs:
|
|
|
|
|
- print(f" 开始评估 {len(all_sugs)} 个建议词(并发限制: {MAX_CONCURRENT_EVALUATIONS})...")
|
|
|
|
|
- eval_tasks = [evaluate_sug(sug) for sug in all_sugs]
|
|
|
|
|
- await asyncio.gather(*eval_tasks)
|
|
|
|
|
|
|
+ # 批量评估
|
|
|
|
|
+ batch_results = await evaluate_batch_with_o(
|
|
|
|
|
+ texts=sug_texts,
|
|
|
|
|
+ o=o,
|
|
|
|
|
+ cache=context.evaluation_cache,
|
|
|
|
|
+ context=context,
|
|
|
|
|
+ round_num=round_num
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 将结果分配回sug对象
|
|
|
|
|
+ for sug, (score, reason) in zip(all_sugs, batch_results):
|
|
|
|
|
+ sug.score_with_o = score
|
|
|
|
|
+ sug.reason = reason
|
|
|
|
|
|
|
|
# 2.3 打印结果并组织到sug_details
|
|
# 2.3 打印结果并组织到sug_details
|
|
|
sug_details = {} # 保存每个Q对应的sug列表
|
|
sug_details = {} # 保存每个Q对应的sug列表
|
|
@@ -3568,41 +4269,60 @@ async def initialize_v2(o: str, context: RunContext) -> list[Segment]:
|
|
|
# 2. 对每个segment拆词并评估
|
|
# 2. 对每个segment拆词并评估
|
|
|
print(f"\n[步骤2] 对每个segment拆词并评估...")
|
|
print(f"\n[步骤2] 对每个segment拆词并评估...")
|
|
|
|
|
|
|
|
- MAX_CONCURRENT_EVALUATIONS = 30 # 🚀 性能优化:从5提升到30,并发评估能力提升6倍
|
|
|
|
|
|
|
+ # 2.1 先对所有segment拆词(并发)
|
|
|
|
|
+ MAX_CONCURRENT_EVALUATIONS = 30
|
|
|
semaphore = asyncio.Semaphore(MAX_CONCURRENT_EVALUATIONS)
|
|
semaphore = asyncio.Semaphore(MAX_CONCURRENT_EVALUATIONS)
|
|
|
|
|
|
|
|
- async def process_segment(segment: Segment) -> Segment:
|
|
|
|
|
- """处理单个segment: 拆词 + 评估segment + 评估词"""
|
|
|
|
|
|
|
+ async def segment_words(segment: Segment) -> Segment:
|
|
|
|
|
+ """对segment进行拆词"""
|
|
|
async with semaphore:
|
|
async with semaphore:
|
|
|
- # 2.1 拆词
|
|
|
|
|
word_result = await Runner.run(word_segmenter, segment.text)
|
|
word_result = await Runner.run(word_segmenter, segment.text)
|
|
|
word_segmentation: WordSegmentation = word_result.final_output
|
|
word_segmentation: WordSegmentation = word_result.final_output
|
|
|
segment.words = word_segmentation.words
|
|
segment.words = word_segmentation.words
|
|
|
|
|
+ return segment
|
|
|
|
|
|
|
|
- # 2.2 评估segment与原始问题的相关度(使用Round 0专用评估)
|
|
|
|
|
- segment.score_with_o, segment.reason = await evaluate_with_o_round0(
|
|
|
|
|
- segment.text, o, context.evaluation_cache
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- # 2.3 评估每个词与原始问题的相关度(使用Round 0专用评估)
|
|
|
|
|
- word_eval_tasks = []
|
|
|
|
|
- for word in segment.words:
|
|
|
|
|
- async def eval_word(w: str) -> tuple[str, float, str]:
|
|
|
|
|
- score, reason = await evaluate_with_o_round0(w, o, context.evaluation_cache)
|
|
|
|
|
- return w, score, reason
|
|
|
|
|
- word_eval_tasks.append(eval_word(word))
|
|
|
|
|
|
|
+ if segment_list:
|
|
|
|
|
+ print(f" [步骤2.1] 对 {len(segment_list)} 个segment进行拆词...")
|
|
|
|
|
+ word_tasks = [segment_words(seg) for seg in segment_list]
|
|
|
|
|
+ await asyncio.gather(*word_tasks)
|
|
|
|
|
+
|
|
|
|
|
+ # 2.2 批量评估所有segments
|
|
|
|
|
+ print(f" [步骤2.2] 批量评估 {len(segment_list)} 个segments...")
|
|
|
|
|
+ segment_texts = [seg.text for seg in segment_list]
|
|
|
|
|
+ segment_results = await evaluate_batch_with_o_round0(
|
|
|
|
|
+ texts=segment_texts,
|
|
|
|
|
+ o=o,
|
|
|
|
|
+ cache=context.evaluation_cache
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- word_results = await asyncio.gather(*word_eval_tasks)
|
|
|
|
|
- for word, score, reason in word_results:
|
|
|
|
|
- segment.word_scores[word] = score
|
|
|
|
|
- segment.word_reasons[word] = reason
|
|
|
|
|
|
|
+ # 分配segment评估结果
|
|
|
|
|
+ for segment, (score, reason) in zip(segment_list, segment_results):
|
|
|
|
|
+ segment.score_with_o = score
|
|
|
|
|
+ segment.reason = reason
|
|
|
|
|
|
|
|
- return segment
|
|
|
|
|
|
|
+ # 2.3 收集所有words并批量评估
|
|
|
|
|
+ all_words = []
|
|
|
|
|
+ word_to_segments = {} # 记录每个word属于哪些segments
|
|
|
|
|
+ for segment in segment_list:
|
|
|
|
|
+ for word in segment.words:
|
|
|
|
|
+ if word not in word_to_segments:
|
|
|
|
|
+ all_words.append(word)
|
|
|
|
|
+ word_to_segments[word] = []
|
|
|
|
|
+ word_to_segments[word].append(segment)
|
|
|
|
|
+
|
|
|
|
|
+ if all_words:
|
|
|
|
|
+ print(f" [步骤2.3] 批量评估 {len(all_words)} 个words(去重后)...")
|
|
|
|
|
+ word_results = await evaluate_batch_with_o_round0(
|
|
|
|
|
+ texts=all_words,
|
|
|
|
|
+ o=o,
|
|
|
|
|
+ cache=context.evaluation_cache
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- if segment_list:
|
|
|
|
|
- print(f" 开始处理 {len(segment_list)} 个segment(并发限制: {MAX_CONCURRENT_EVALUATIONS})...")
|
|
|
|
|
- process_tasks = [process_segment(seg) for seg in segment_list]
|
|
|
|
|
- await asyncio.gather(*process_tasks)
|
|
|
|
|
|
|
+ # 分配word评估结果到所有相关的segments
|
|
|
|
|
+ for word, (score, reason) in zip(all_words, word_results):
|
|
|
|
|
+ for segment in word_to_segments[word]:
|
|
|
|
|
+ segment.word_scores[word] = score
|
|
|
|
|
+ segment.word_reasons[word] = reason
|
|
|
|
|
|
|
|
# 打印步骤1结果
|
|
# 打印步骤1结果
|
|
|
print(f"\n[步骤1: 分段及拆词 结果]")
|
|
print(f"\n[步骤1: 分段及拆词 结果]")
|
|
@@ -4093,6 +4813,7 @@ async def iterative_loop_v2(
|
|
|
enable_evaluation: bool = False
|
|
enable_evaluation: bool = False
|
|
|
):
|
|
):
|
|
|
"""v121 主迭代循环"""
|
|
"""v121 主迭代循环"""
|
|
|
|
|
+ import time
|
|
|
|
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"\n{'='*60}")
|
|
|
print(f"开始v121迭代循环(语义分段跨域组词版)")
|
|
print(f"开始v121迭代循环(语义分段跨域组词版)")
|
|
@@ -4101,7 +4822,13 @@ async def iterative_loop_v2(
|
|
|
print(f"{'='*60}")
|
|
print(f"{'='*60}")
|
|
|
|
|
|
|
|
# Round 0: 初始化(语义分段 + 拆词)
|
|
# Round 0: 初始化(语义分段 + 拆词)
|
|
|
|
|
+ print(f"\n{'='*60}")
|
|
|
|
|
+ print(f"Round 0: 初始化(语义分段 + 拆词)")
|
|
|
|
|
+ print(f"{'='*60}")
|
|
|
|
|
+ round0_start_time = time.time()
|
|
|
segments = await initialize_v2(context.o, context)
|
|
segments = await initialize_v2(context.o, context)
|
|
|
|
|
+ round0_elapsed = time.time() - round0_start_time
|
|
|
|
|
+ print(f"\n✅ Round 0 完成,耗时: {round0_elapsed:.2f}秒")
|
|
|
|
|
|
|
|
# API实例
|
|
# API实例
|
|
|
xiaohongshu_api = XiaohongshuSearchRecommendations()
|
|
xiaohongshu_api = XiaohongshuSearchRecommendations()
|
|
@@ -4120,8 +4847,11 @@ async def iterative_loop_v2(
|
|
|
num_segments = len(segments)
|
|
num_segments = len(segments)
|
|
|
actual_max_rounds = min(max_rounds, num_segments)
|
|
actual_max_rounds = min(max_rounds, num_segments)
|
|
|
round_num = 1
|
|
round_num = 1
|
|
|
|
|
+ rounds_elapsed_times = [] # 记录每轮耗时
|
|
|
|
|
|
|
|
while query_input and round_num <= actual_max_rounds:
|
|
while query_input and round_num <= actual_max_rounds:
|
|
|
|
|
+ round_start_time = time.time()
|
|
|
|
|
+
|
|
|
query_input, search_list = await run_round_v2( # 不再接收提取结果
|
|
query_input, search_list = await run_round_v2( # 不再接收提取结果
|
|
|
round_num=round_num,
|
|
round_num=round_num,
|
|
|
query_input=query_input, # 传递上一轮的输出
|
|
query_input=query_input, # 传递上一轮的输出
|
|
@@ -4135,6 +4865,10 @@ async def iterative_loop_v2(
|
|
|
enable_evaluation=enable_evaluation
|
|
enable_evaluation=enable_evaluation
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+ round_elapsed = time.time() - round_start_time
|
|
|
|
|
+ rounds_elapsed_times.append(round_elapsed)
|
|
|
|
|
+ print(f"\n✅ Round {round_num} 完成,耗时: {round_elapsed:.2f}秒")
|
|
|
|
|
+
|
|
|
all_search_list.extend(search_list)
|
|
all_search_list.extend(search_list)
|
|
|
# all_extraction_results.update(extraction_results) # 内容提取流程已断开
|
|
# all_extraction_results.update(extraction_results) # 内容提取流程已断开
|
|
|
|
|
|
|
@@ -4151,6 +4885,12 @@ async def iterative_loop_v2(
|
|
|
print(f" 总搜索次数: {len(all_search_list)}")
|
|
print(f" 总搜索次数: {len(all_search_list)}")
|
|
|
print(f" 总帖子数: {sum(len(s.post_list) for s in all_search_list)}")
|
|
print(f" 总帖子数: {sum(len(s.post_list) for s in all_search_list)}")
|
|
|
# print(f" 提取帖子数: {len(all_extraction_results)}") # 内容提取流程已断开
|
|
# print(f" 提取帖子数: {len(all_extraction_results)}") # 内容提取流程已断开
|
|
|
|
|
+ print(f"\n[耗时统计]")
|
|
|
|
|
+ print(f" Round 0 耗时: {round0_elapsed:.2f}秒")
|
|
|
|
|
+ for i, elapsed in enumerate(rounds_elapsed_times, 1):
|
|
|
|
|
+ print(f" Round {i} 耗时: {elapsed:.2f}秒")
|
|
|
|
|
+ total_rounds_time = round0_elapsed + sum(rounds_elapsed_times)
|
|
|
|
|
+ print(f" 所有轮次总耗时: {total_rounds_time:.2f}秒 ({total_rounds_time/60:.2f}分钟)")
|
|
|
print(f"\n[统计信息]")
|
|
print(f"\n[统计信息]")
|
|
|
print(f" LLM评估调用: {context.stats_llm_calls} 次")
|
|
print(f" LLM评估调用: {context.stats_llm_calls} 次")
|
|
|
print(f" SUG请求: {context.stats_sug_requests} 次 (缓存命中: {context.stats_sug_cache_hits} 次)")
|
|
print(f" SUG请求: {context.stats_sug_requests} 次 (缓存命中: {context.stats_sug_cache_hits} 次)")
|
|
@@ -4166,6 +4906,9 @@ async def iterative_loop_v2(
|
|
|
|
|
|
|
|
async def main(input_dir: str, max_rounds: int = 2, sug_threshold: float = 0.7, visualize: bool = False, enable_evaluation: bool = False):
|
|
async def main(input_dir: str, max_rounds: int = 2, sug_threshold: float = 0.7, visualize: bool = False, enable_evaluation: bool = False):
|
|
|
"""主函数"""
|
|
"""主函数"""
|
|
|
|
|
+ import time
|
|
|
|
|
+ total_start_time = time.time() # 记录总开始时间
|
|
|
|
|
+
|
|
|
current_time, log_url = set_trace()
|
|
current_time, log_url = set_trace()
|
|
|
|
|
|
|
|
# 读取输入
|
|
# 读取输入
|
|
@@ -4229,7 +4972,12 @@ async def main(input_dir: str, max_rounds: int = 2, sug_threshold: float = 0.7,
|
|
|
output += f"总搜索次数:{len(all_search_list)}\n"
|
|
output += f"总搜索次数:{len(all_search_list)}\n"
|
|
|
output += f"总帖子数:{sum(len(s.post_list) for s in all_search_list)}\n"
|
|
output += f"总帖子数:{sum(len(s.post_list) for s in all_search_list)}\n"
|
|
|
# output += f"提取帖子数:{len(all_extraction_results)}\n" # 内容提取流程已断开
|
|
# output += f"提取帖子数:{len(all_extraction_results)}\n" # 内容提取流程已断开
|
|
|
|
|
+
|
|
|
|
|
+ # 计算总耗时
|
|
|
|
|
+ total_elapsed_time = time.time() - total_start_time
|
|
|
|
|
+
|
|
|
output += f"\n统计信息:\n"
|
|
output += f"\n统计信息:\n"
|
|
|
|
|
+ output += f" 总耗时: {total_elapsed_time:.2f}秒 ({total_elapsed_time/60:.2f}分钟)\n"
|
|
|
output += f" LLM评估调用: {run_context.stats_llm_calls} 次\n"
|
|
output += f" LLM评估调用: {run_context.stats_llm_calls} 次\n"
|
|
|
output += f" SUG请求: {run_context.stats_sug_requests} 次 (缓存命中: {run_context.stats_sug_cache_hits} 次)\n"
|
|
output += f" SUG请求: {run_context.stats_sug_requests} 次 (缓存命中: {run_context.stats_sug_cache_hits} 次)\n"
|
|
|
output += f" 搜索调用: {run_context.stats_search_calls} 次\n"
|
|
output += f" 搜索调用: {run_context.stats_search_calls} 次\n"
|