|  | @@ -1,6 +1,7 @@
 | 
											
												
													
														|  |  import json
 |  |  import json
 | 
											
												
													
														|  |  import datetime
 |  |  import datetime
 | 
											
												
													
														|  |  import random
 |  |  import random
 | 
											
												
													
														|  | 
 |  | +import traceback
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  from tqdm import tqdm
 |  |  from tqdm import tqdm
 | 
											
												
													
														|  |  from openai import OpenAI
 |  |  from openai import OpenAI
 | 
											
										
											
												
													
														|  | @@ -9,6 +10,34 @@ from pqai_agent.database import MySQLManager
 | 
											
												
													
														|  |  from pqai_agent.agents.message_push_agent import MessagePushAgent
 |  |  from pqai_agent.agents.message_push_agent import MessagePushAgent
 | 
											
												
													
														|  |  from pqai_agent.agents.message_reply_agent import MessageReplyAgent
 |  |  from pqai_agent.agents.message_reply_agent import MessageReplyAgent
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | 
 |  | +evaluation_metrics_dict = {
 | 
											
												
													
														|  | 
 |  | +    "1.2": "是否识别关键信息",
 | 
											
												
													
														|  | 
 |  | +    "1.3": "是否能够理解歧义词/模糊词",
 | 
											
												
													
														|  | 
 |  | +    "1.4": "是否能理解表情包,图片消息",
 | 
											
												
													
														|  | 
 |  | +    "1.5": "是否能理解语音/方言",
 | 
											
												
													
														|  | 
 |  | +    "2.1": "回复是否与用户意图相关",
 | 
											
												
													
														|  | 
 |  | +    "2.2": "回复是否清晰简洁",
 | 
											
												
													
														|  | 
 |  | +    "2.3": "回复是否流畅",
 | 
											
												
													
														|  | 
 |  | +    "2.4": "回复语法是否规范",
 | 
											
												
													
														|  | 
 |  | +    "3.1": "是否能理解代词(他,她, 它, 这个那个)",
 | 
											
												
													
														|  | 
 |  | +    "3.2": "是否能延续上文话题",
 | 
											
												
													
														|  | 
 |  | +    "3.3": "是否记住上文的基础信息",
 | 
											
												
													
														|  | 
 |  | +    "3.4": "是否及时结束聊天",
 | 
											
												
													
														|  | 
 |  | +    "4.1": "是否讨论超出角色认知范围的信息",
 | 
											
												
													
														|  | 
 |  | +    "4.2": "是否讨论了不符合当前时代背景的语言、物品、事件、概念",
 | 
											
												
													
														|  | 
 |  | +    "4.3": "是否表现出与agent 人设相符的专业知识、生活经验或者常识",
 | 
											
												
													
														|  | 
 |  | +    "5.1": "agent 的言行是否反映其预设的核心性格",
 | 
											
												
													
														|  | 
 |  | +    "5.2": "agent 的价值观和道德观是否符合其预设标准",
 | 
											
												
													
														|  | 
 |  | +    "6.1": "agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代?",
 | 
											
												
													
														|  | 
 |  | +    "6.2": "agent 语气、语调(恭敬、傲慢、亲切、疏离、热情、冷淡)是否稳定?",
 | 
											
												
													
														|  | 
 |  | +    "6.3": "agent 表达习惯、口头禅是否符合角色预设特点",
 | 
											
												
													
														|  | 
 |  | +    "7.1": "agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致?",
 | 
											
												
													
														|  | 
 |  | +    "8.1": "agent 是否按照预设的互动模式与用户沟通",
 | 
											
												
													
														|  | 
 |  | +    "8.2": "agent 是否对自身角色有正确理解",
 | 
											
												
													
														|  | 
 |  | +    "8.3": "agent 是否回复超越用户认知的信息"
 | 
											
												
													
														|  | 
 |  | +}
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  def fetch_deepseek_completion(prompt, output_type='text'):
 |  |  def fetch_deepseek_completion(prompt, output_type='text'):
 | 
											
												
													
														|  |      """
 |  |      """
 | 
											
												
													
														|  |      deep_seek方法
 |  |      deep_seek方法
 | 
											
										
											
												
													
														|  | @@ -31,7 +60,7 @@ def fetch_deepseek_completion(prompt, output_type='text'):
 | 
											
												
													
														|  |                  "content": prompt,
 |  |                  "content": prompt,
 | 
											
												
													
														|  |              }
 |  |              }
 | 
											
												
													
														|  |          ],
 |  |          ],
 | 
											
												
													
														|  | -        model="deepseek-chat",
 |  | 
 | 
											
												
													
														|  | 
 |  | +        model="deepseek-reasoner",
 | 
											
												
													
														|  |          response_format=response_format,
 |  |          response_format=response_format,
 | 
											
												
													
														|  |      )
 |  |      )
 | 
											
												
													
														|  |      response = chat_completion.choices[0].message.content
 |  |      response = chat_completion.choices[0].message.content
 | 
											
										
											
												
													
														|  | @@ -205,16 +234,80 @@ def evaluate_push_agent_prompt(dialogue_history, push_message, user_profile_, ag
 | 
											
												
													
														|  |      return prompt_
 |  |      return prompt_
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -def evaluate_reply_agent(dialogue_history, reply_message, user_profile_, agent_profile):
 |  | 
 | 
											
												
													
														|  | 
 |  | +def evaluate_reply_agent_prompt(dialogue_history, message, user_profile_, agent_profile, push_time):
 | 
											
												
													
														|  |      """
 |  |      """
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      :param dialogue_history:
 |  |      :param dialogue_history:
 | 
											
												
													
														|  | -    :param reply_message:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    :param message:
 | 
											
												
													
														|  |      :param user_profile_:
 |  |      :param user_profile_:
 | 
											
												
													
														|  |      :param agent_profile:
 |  |      :param agent_profile:
 | 
											
												
													
														|  |      :return:
 |  |      :return:
 | 
											
												
													
														|  |      """
 |  |      """
 | 
											
												
													
														|  | -    return
 |  | 
 | 
											
												
													
														|  | 
 |  | +    output_format = {
 | 
											
												
													
														|  | 
 |  | +        "1.1": {
 | 
											
												
													
														|  | 
 |  | +            "score": 1,
 | 
											
												
													
														|  | 
 |  | +            "reason": "理由"
 | 
											
												
													
														|  | 
 |  | +        },
 | 
											
												
													
														|  | 
 |  | +        "1.2": {
 | 
											
												
													
														|  | 
 |  | +            "score": 0,
 | 
											
												
													
														|  | 
 |  | +            "reason": "理由"
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +    prompt_ = f"""
 | 
											
												
													
														|  | 
 |  | +        **评估任务:** 基于给定的对话历史和 Agent 预设信息,评估 Agent 在对话中的表现。使用以下维度和指标进行评分。
 | 
											
												
													
														|  | 
 |  | +        **评估指标:**
 | 
											
												
													
														|  | 
 |  | +          1. 理解能力
 | 
											
												
													
														|  | 
 |  | +            1.1 是否识别用户核心意图
 | 
											
												
													
														|  | 
 |  | +            1.2 是否识别关键信息
 | 
											
												
													
														|  | 
 |  | +            1.3 是否能够理解歧义词/模糊词
 | 
											
												
													
														|  | 
 |  | +            1.4 是否能理解表情包,图片消息
 | 
											
												
													
														|  | 
 |  | +            1.5 是否能理解语音/方言
 | 
											
												
													
														|  | 
 |  | +          2. 回复能力
 | 
											
												
													
														|  | 
 |  | +            2.1 回复是否与用户意图相关
 | 
											
												
													
														|  | 
 |  | +            2.2 回复是否清晰简洁
 | 
											
												
													
														|  | 
 |  | +            2.3 回复是否流畅
 | 
											
												
													
														|  | 
 |  | +            2.4 回复语法是否规范
 | 
											
												
													
														|  | 
 |  | +          3. 上下文管理能力
 | 
											
												
													
														|  | 
 |  | +            3.1 是否能理解代词(他,她, 它, 这个那个)
 | 
											
												
													
														|  | 
 |  | +            3.2 是否能延续上文话题
 | 
											
												
													
														|  | 
 |  | +            3.4 是否及时结束聊天
 | 
											
												
													
														|  | 
 |  | +          4. 背景知识一致性
 | 
											
												
													
														|  | 
 |  | +            4.1 是否讨论超出角色认知范围的信息
 | 
											
												
													
														|  | 
 |  | +            4.2 是否讨论了不符合当前时代背景的语言、物品、事件、概念
 | 
											
												
													
														|  | 
 |  | +            4.3 是否表现出与agent 人设相符的专业知识、生活经验或者常识
 | 
											
												
													
														|  | 
 |  | +          5. 性格行为一致性
 | 
											
												
													
														|  | 
 |  | +            5.1 agent 的言行是否反映其预设的核心性格
 | 
											
												
													
														|  | 
 |  | +            5.2 agent 的价值观和道德观是否符合其预设标准
 | 
											
												
													
														|  | 
 |  | +          6. 语言风格一致性
 | 
											
												
													
														|  | 
 |  | +            6.1 agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代?
 | 
											
												
													
														|  | 
 |  | +            6.2 agent 语气、语调(恭敬、傲慢、亲切、疏离、热情、冷淡)是否稳定?
 | 
											
												
													
														|  | 
 |  | +            6.3 agent 表达习惯、口头禅是否符合角色预设特点
 | 
											
												
													
														|  | 
 |  | +          7. 目标动机一致性
 | 
											
												
													
														|  | 
 |  | +            7.1 agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致?
 | 
											
												
													
														|  | 
 |  | +          8. 关系认知一致性
 | 
											
												
													
														|  | 
 |  | +            8.1 agent 是否按照预设的互动模式与用户沟通 
 | 
											
												
													
														|  | 
 |  | +            8.2 agent 是否对自身角色有正确理解
 | 
											
												
													
														|  | 
 |  | +            8.3 agent 是否回复超越用户认知的信息
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        **评估规则:**
 | 
											
												
													
														|  | 
 |  | +        - 对于每个指标:
 | 
											
												
													
														|  | 
 |  | +          - 如果符合要求,得 1 分。
 | 
											
												
													
														|  | 
 |  | +          - 如果不符合要求,得 0 分。
 | 
											
												
													
														|  | 
 |  | +          - 如果指标不适用(如对话未涉及相关场景),得 1 分(无需评估)。
 | 
											
												
													
														|  | 
 |  | +        - 理由必须基于对话内容,简短且客观,理由需要是中文, 如果是无需评估,则理由写无需评估
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        **输入:**
 | 
											
												
													
														|  | 
 |  | +        - **对话历史**: {dialogue_history}
 | 
											
												
													
														|  | 
 |  | +        - **Agent 预设信息**: {agent_profile}
 | 
											
												
													
														|  | 
 |  | +        - **用户预设信息**: {user_profile_}
 | 
											
												
													
														|  | 
 |  | +        - **Agent 消息**: {message}
 | 
											
												
													
														|  | 
 |  | +        - **Agent 发送消息时间**:{push_time}
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        **输出格式要求:JSON 格式**
 | 
											
												
													
														|  | 
 |  | +        输出格式参考:{output_format}
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +    return prompt_
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  
 |  |  
 | 
											
										
											
												
													
														|  | @@ -231,53 +324,62 @@ mysql_client = MySQLManager(config)
 | 
											
												
													
														|  |  if __name__ == '__main__':
 |  |  if __name__ == '__main__':
 | 
											
												
													
														|  |      import pqai_agent.logging_service
 |  |      import pqai_agent.logging_service
 | 
											
												
													
														|  |      pqai_agent.logging_service.setup_root_logger()
 |  |      pqai_agent.logging_service.setup_root_logger()
 | 
											
												
													
														|  | -    with open("scripts/dialogue_data_set.json", "r", encoding="utf-8") as f:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    with open("push_message_evaluation_result.json", "r", encoding="utf-8") as f:
 | 
											
												
													
														|  |          data = json.load(f)
 |  |          data = json.load(f)
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    dialogues = random.sample(data, 100)
 |  | 
 | 
											
												
													
														|  | 
 |  | +    # dialogues = random.sample(data, 1)
 | 
											
												
													
														|  |      F = []
 |  |      F = []
 | 
											
												
													
														|  | -    for sub_dialogues in tqdm(dialogues):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    for sub_dialogues in tqdm(data):
 | 
											
												
													
														|  |          try:
 |  |          try:
 | 
											
												
													
														|  |              # user 相关
 |  |              # user 相关
 | 
											
												
													
														|  | -            user_id = sub_dialogues['user_id']
 |  | 
 | 
											
												
													
														|  | -            user_profile_response = get_profile_info(user_id, "user")
 |  | 
 | 
											
												
													
														|  | -            user_profile, avatar = json.loads(user_profile_response[0]['profile']), user_profile_response[0]['avatar']
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -            user_profile['avatar'] = avatar
 |  | 
 | 
											
												
													
														|  | -            user_profile['current_datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 |  | 
 | 
											
												
													
														|  | 
 |  | +            # user_id = sub_dialogues['user_id']
 | 
											
												
													
														|  | 
 |  | +            # user_profile_response = get_profile_info(user_id, "user")
 | 
											
												
													
														|  | 
 |  | +            # user_profile, avatar = json.loads(user_profile_response[0]['profile']), user_profile_response[0]['avatar']
 | 
											
												
													
														|  | 
 |  | +            #
 | 
											
												
													
														|  | 
 |  | +            # user_profile['avatar'] = avatar
 | 
											
												
													
														|  | 
 |  | +            # user_profile['current_datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 | 
											
												
													
														|  | 
 |  | +            #
 | 
											
												
													
														|  | 
 |  | +            # # staff 相关
 | 
											
												
													
														|  | 
 |  | +            # staff_id = sub_dialogues['staff_id']
 | 
											
												
													
														|  | 
 |  | +            # staff_profile_response = get_profile_info(staff_id, "staff")
 | 
											
												
													
														|  | 
 |  | +            # staff_profile = json.loads(staff_profile_response[0]['profile'])
 | 
											
												
													
														|  | 
 |  | +            #
 | 
											
												
													
														|  | 
 |  | +            # user_profile['formatted_staff_profile'] = staff_profile
 | 
											
												
													
														|  | 
 |  | +            #
 | 
											
												
													
														|  | 
 |  | +            # push_agent = MessagePushAgent()
 | 
											
												
													
														|  | 
 |  | +            # # reply_agent = MessageReplyAgent()
 | 
											
												
													
														|  | 
 |  | +            #
 | 
											
												
													
														|  | 
 |  | +            # # message 相关
 | 
											
												
													
														|  | 
 |  | +            # message = sub_dialogues['dialogue_history']
 | 
											
												
													
														|  | 
 |  | +            # agent_message = push_agent.generate_message(
 | 
											
												
													
														|  | 
 |  | +            #     context=user_profile,
 | 
											
												
													
														|  | 
 |  | +            #     dialogue_history=message
 | 
											
												
													
														|  | 
 |  | +            # )
 | 
											
												
													
														|  | 
 |  | +            message = sub_dialogues["dialogue_history"]
 | 
											
												
													
														|  | 
 |  | +            agent_message = sub_dialogues["push_message"]
 | 
											
												
													
														|  | 
 |  | +            push_time = sub_dialogues["push_time"]
 | 
											
												
													
														|  | 
 |  | +            user_profile = sub_dialogues["user_profile"]
 | 
											
												
													
														|  | 
 |  | +            staff_profile = sub_dialogues["agent_profile"]
 | 
											
												
													
														|  | 
 |  | +            if agent_message:
 | 
											
												
													
														|  | 
 |  | +                prompt = evaluate_reply_agent_prompt(message, agent_message, user_profile, staff_profile, push_time)
 | 
											
												
													
														|  | 
 |  | +                # prompt = evaluate_conversation_quality_task(message, user_profile, staff_profile)
 | 
											
												
													
														|  | 
 |  | +                response = fetch_deepseek_completion(prompt, output_type='json')
 | 
											
												
													
														|  | 
 |  | +                obj = {
 | 
											
												
													
														|  | 
 |  | +                    "user_profile": user_profile,
 | 
											
												
													
														|  | 
 |  | +                    "agent_profile": staff_profile,
 | 
											
												
													
														|  | 
 |  | +                    "dialogue_history": message,
 | 
											
												
													
														|  | 
 |  | +                    "push_message": agent_message,
 | 
											
												
													
														|  | 
 |  | +                    "push_time": push_time,
 | 
											
												
													
														|  | 
 |  | +                    "evaluation_result": response
 | 
											
												
													
														|  | 
 |  | +                }
 | 
											
												
													
														|  | 
 |  | +                F.append(obj)
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -            # staff 相关
 |  | 
 | 
											
												
													
														|  | -            staff_id = sub_dialogues['staff_id']
 |  | 
 | 
											
												
													
														|  | -            staff_profile_response = get_profile_info(staff_id, "staff")
 |  | 
 | 
											
												
													
														|  | -            staff_profile = json.loads(staff_profile_response[0]['profile'])
 |  | 
 | 
											
												
													
														|  | 
 |  | +        except Exception as e:
 | 
											
												
													
														|  | 
 |  | +            print(e)
 | 
											
												
													
														|  | 
 |  | +            print(traceback.format_exc())
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -            user_profile['formatted_staff_profile'] = staff_profile
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -            push_agent = MessagePushAgent()
 |  | 
 | 
											
												
													
														|  | -            # reply_agent = MessageReplyAgent()
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -            # message 相关
 |  | 
 | 
											
												
													
														|  | -            message = sub_dialogues['dialogue_history']
 |  | 
 | 
											
												
													
														|  | -            agent_message = push_agent.generate_message(
 |  | 
 | 
											
												
													
														|  | -                context=user_profile,
 |  | 
 | 
											
												
													
														|  | -                dialogue_history=message
 |  | 
 | 
											
												
													
														|  | -            )
 |  | 
 | 
											
												
													
														|  | -            prompt = evaluate_push_agent_prompt(message, agent_message, user_profile, staff_profile)
 |  | 
 | 
											
												
													
														|  | -            # prompt = evaluate_conversation_quality_task(message, user_profile, staff_profile)
 |  | 
 | 
											
												
													
														|  | -            response = fetch_deepseek_completion(prompt, output_type='json')
 |  | 
 | 
											
												
													
														|  | -            obj = {
 |  | 
 | 
											
												
													
														|  | -                "user_profile": user_profile,
 |  | 
 | 
											
												
													
														|  | -                "agent_profile": staff_profile,
 |  | 
 | 
											
												
													
														|  | -                "dialogue_history": message,
 |  | 
 | 
											
												
													
														|  | -                "push_message": agent_message,
 |  | 
 | 
											
												
													
														|  | -                "push_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 |  | 
 | 
											
												
													
														|  | -                "evaluation_result": response
 |  | 
 | 
											
												
													
														|  | -            }
 |  | 
 | 
											
												
													
														|  | -            F.append(obj)
 |  | 
 | 
											
												
													
														|  | -        except:
 |  | 
 | 
											
												
													
														|  | -            continue
 |  | 
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    with open("push_message_evaluation_result.json", "w", encoding="utf-8") as f:
 |  | 
 | 
											
												
													
														|  | -        f.write(json.dumps(F, indent=4, ensure_ascii=False))
 |  | 
 | 
											
												
													
														|  | 
 |  | +    with open("push_message_evaluation_result_2.json", "w", encoding="utf-8") as f:
 | 
											
												
													
														|  | 
 |  | +        f.write(json.dumps(F, ensure_ascii=False, indent=4))
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  
 |  |  
 |