|
@@ -1,6 +1,7 @@
|
|
|
import json
|
|
|
import datetime
|
|
|
import random
|
|
|
+import traceback
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
from openai import OpenAI
|
|
@@ -9,6 +10,34 @@ from pqai_agent.database import MySQLManager
|
|
|
from pqai_agent.agents.message_push_agent import MessagePushAgent
|
|
|
from pqai_agent.agents.message_reply_agent import MessageReplyAgent
|
|
|
|
|
|
# Lookup table from metric id to its (Chinese) description. The ids mirror the
# numbered rubric embedded in the evaluation prompt, so every id the model is
# asked to score -- including "1.1" -- must be present here.
evaluation_metrics_dict = {
    # 1. Comprehension
    "1.1": "是否识别用户核心意图",
    "1.2": "是否识别关键信息",
    "1.3": "是否能够理解歧义词/模糊词",
    "1.4": "是否能理解表情包,图片消息",
    "1.5": "是否能理解语音/方言",
    # 2. Reply quality
    "2.1": "回复是否与用户意图相关",
    "2.2": "回复是否清晰简洁",
    "2.3": "回复是否流畅",
    "2.4": "回复语法是否规范",
    # 3. Context management
    "3.1": "是否能理解代词(他,她, 她, 这个那个)",
    "3.2": "是否能延续上文话题",
    "3.3": "是否记住上文的基础信息",
    "3.4": "是否及时结束聊天",
    # 4. Background-knowledge consistency
    "4.1": "是否讨论超出角色认知范围的信息",
    "4.2": "是否讨论了不符合当前时代背景的语言、物品、事件、概念",
    "4.3": "是否表现出与agent 人设相符的专业知识、生活经验或者常识",
    # 5. Personality / behaviour consistency
    "5.1": "agent 的言行是否反映其预设的核心性格",
    "5.2": "agent 的价值观和道德观是否符合其预设标准",
    # 6. Language-style consistency
    "6.1": "agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代?",
    "6.2": "agent 语气、语调(恭敬、傲慢、亲切、疏离、热情、冷淡)是否稳定?",
    "6.3": "agent 表达习惯、口头禅是否符合角色预设特点",
    # 7. Goal / motivation consistency
    "7.1": "agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致?",
    # 8. Relationship-awareness consistency
    "8.1": "agent 是否按照预设的互动模式与用户沟通",
    "8.2": "agent 是否对自身角色有正确理解",
    "8.3": "agent 是否回复超越用户认知的信息",
}
|
|
|
+
|
|
|
+
|
|
|
def fetch_deepseek_completion(prompt, output_type='text'):
|
|
|
"""
|
|
|
deep_seek方法
|
|
@@ -31,7 +60,7 @@ def fetch_deepseek_completion(prompt, output_type='text'):
|
|
|
"content": prompt,
|
|
|
}
|
|
|
],
|
|
|
- model="deepseek-chat",
|
|
|
+ model="deepseek-reasoner",
|
|
|
response_format=response_format,
|
|
|
)
|
|
|
response = chat_completion.choices[0].message.content
|
|
@@ -205,16 +234,80 @@ def evaluate_push_agent_prompt(dialogue_history, push_message, user_profile_, ag
|
|
|
return prompt_
|
|
|
|
|
|
|
|
|
def evaluate_reply_agent_prompt(dialogue_history, message, user_profile_, agent_profile, push_time):
    """
    Build the LLM prompt used to score one agent message against the rubric.

    The prompt lists every metric (1.1 - 8.3), the 0/1 scoring rules, the
    inputs given below, and a JSON example of the expected answer shape.
    Each argument is interpolated into the prompt text as-is.

    :param dialogue_history: conversation that preceded the agent message
    :param message: the agent message being evaluated
    :param user_profile_: preset profile of the user
    :param agent_profile: preset profile (persona) of the agent
    :param push_time: time at which the agent sent the message
    :return: the complete prompt string
    """
    output_format = {
        "1.1": {
            "score": 1,
            "reason": "理由"
        },
        "1.2": {
            "score": 0,
            "reason": "理由"
        }
    }
    # The prompt demands JSON output, so show the example as real JSON
    # (double quotes) instead of a Python dict repr (single quotes).
    output_format_json = json.dumps(output_format, ensure_ascii=False)
    prompt_ = f"""
    **评估任务:** 基于给定的对话历史和 Agent 预设信息,评估 Agent 在对话中的表现。使用以下维度和指标进行评分。
    **评估指标:**
    1. 理解能力
        1.1 是否识别用户核心意图
        1.2 是否识别关键信息
        1.3 是否能够理解歧义词/模糊词
        1.4 是否能理解表情包,图片消息
        1.5 是否能理解语音/方言
    2. 回复能力
        2.1 回复是否与用户意图相关
        2.2 回复是否清晰简洁
        2.3 回复是否流畅
        2.4 回复语法是否规范
    3. 上下文管理能力
        3.1 是否能理解代词(他,她, 她, 这个那个)
        3.2 是否能延续上文话题
        3.3 是否记住上文的基础信息
        3.4 是否及时结束聊天
    4. 背景知识一致性
        4.1 是否讨论超出角色认知范围的信息
        4.2 是否讨论了不符合当前时代背景的语言、物品、事件、概念
        4.3 是否表现出与agent 人设相符的专业知识、生活经验或者常识
    5. 性格行为一致性
        5.1 agent 的言行是否反映其预设的核心性格
        5.2 agent 的价值观和道德观是否符合其预设标准
    6. 语言风格一致性
        6.1 agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代?
        6.2 agent 语气、语调(恭敬、傲慢、亲切、疏离、热情、冷淡)是否稳定?
        6.3 agent 表达习惯、口头禅是否符合角色预设特点
    7. 目标动机一致性
        7.1 agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致?
    8. 关系认知一致性
        8.1 agent 是否按照预设的互动模式与用户沟通
        8.2 agent 是否对自身角色有正确理解
        8.3 agent 是否回复超越用户认知的信息

    **评估规则:**
    - 对于每个指标:
        - 如果符合要求,得 1 分。
        - 如果不符合要求,得 0 分。
        - 如果指标不适用(如对话未涉及相关场景),得 1 分(无需评估)。
    - 理由必须基于对话内容,简短且客观,理由需要是中文, 如果是无需评估,则理由写无需评估

    **输入:**
    - **对话历史**: {dialogue_history}
    - **Agent 预设信息**: {agent_profile}
    - **用户预设信息**: {user_profile_}
    - **Agent 消息**: {message}
    - **Agent 发送消息时间**:{push_time}

    **输出格式要求:JSON 格式**
    输出格式参考:{output_format_json}

    """
    return prompt_
|
|
|
|
|
|
|
|
|
|
|
@@ -231,53 +324,62 @@ mysql_client = MySQLManager(config)
|
|
|
if __name__ == '__main__':
    import pqai_agent.logging_service
    pqai_agent.logging_service.setup_root_logger()

    # Re-score previously generated push messages: the input file already
    # carries the dialogue, both profiles, the pushed message and its send
    # time, so no profile lookup or message generation happens here.
    input_path = "push_message_evaluation_result.json"
    output_path = "push_message_evaluation_result_2.json"

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    try:
        for sub_dialogues in tqdm(data):
            try:
                message = sub_dialogues["dialogue_history"]
                agent_message = sub_dialogues["push_message"]
                push_time = sub_dialogues["push_time"]
                user_profile = sub_dialogues["user_profile"]
                staff_profile = sub_dialogues["agent_profile"]

                # Records without a pushed message have nothing to evaluate.
                if not agent_message:
                    continue

                prompt = evaluate_reply_agent_prompt(
                    message, agent_message, user_profile, staff_profile, push_time
                )
                response = fetch_deepseek_completion(prompt, output_type='json')
                results.append({
                    "user_profile": user_profile,
                    "agent_profile": staff_profile,
                    "dialogue_history": message,
                    "push_message": agent_message,
                    "push_time": push_time,
                    "evaluation_result": response,
                })
            except Exception:
                # Best effort: report the failing record and keep going.
                # The traceback already ends with the exception message.
                print(traceback.format_exc())
    finally:
        # Written in `finally` so that an interrupted run (e.g. Ctrl-C, which
        # `except Exception` does not catch) still saves the evaluations
        # collected so far instead of discarding them.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
|
|
|
|
|
|
|