hace 1 semana · 5f953d7406
--- a/evaluate_agent.py
+++ b/evaluate_agent.py
@@ -1,6 +1,7 @@
 
				 import json
			
 
				 import datetime
			
 
				 import random
			
 
				+import traceback
			
 
				 
			
 
				 from tqdm import tqdm
			
 
				 from openai import OpenAI
			
@@ -9,6 +10,34 @@ from pqai_agent.database import MySQLManager
 
				 from pqai_agent.agents.message_push_agent import MessagePushAgent
			
 
				 from pqai_agent.agents.message_reply_agent import MessageReplyAgent
			
 
				 
			
 
				+evaluation_metrics_dict = {
			
 
				+    "1.2": "是否识别关键信息",
			
 
				+    "1.3": "是否能够理解歧义词/模糊词",
			
 
				+    "1.4": "是否能理解表情包，图片消息",
			
 
				+    "1.5": "是否能理解语音/方言",
			
 
				+    "2.1": "回复是否与用户意图相关",
			
 
				+    "2.2": "回复是否清晰简洁",
			
 
				+    "2.3": "回复是否流畅",
			
 
				+    "2.4": "回复语法是否规范",
			
 
				+    "3.1": "是否能理解代词（他，她，它，这个那个）",
			
 
				+    "3.2": "是否能延续上文话题",
			
 
				+    "3.3": "是否记住上文的基础信息",
			
 
				+    "3.4": "是否及时结束聊天",
			
 
				+    "4.1": "是否讨论超出角色认知范围的信息",
			
 
				+    "4.2": "是否讨论了不符合当前时代背景的语言、物品、事件、概念",
			
 
				+    "4.3": "是否表现出与agent 人设相符的专业知识、生活经验或者常识",
			
 
				+    "5.1": "agent 的言行是否反映其预设的核心性格",
			
 
				+    "5.2": "agent 的价值观和道德观是否符合其预设标准",
			
 
				+    "6.1": "agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代？",
			
 
				+    "6.2": "agent 语气、语调（恭敬、傲慢、亲切、疏离、热情、冷淡）是否稳定？",
			
 
				+    "6.3": "agent 表达习惯、口头禅是否符合角色预设特点",
			
 
				+    "7.1": "agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致？",
			
 
				+    "8.1": "agent 是否按照预设的互动模式与用户沟通",
			
 
				+    "8.2": "agent 是否对自身角色有正确理解",
			
 
				+    "8.3": "agent 是否回复超越用户认知的信息"
			
 
				+}
			
 
				+
			
 
				+
			
 
				 def fetch_deepseek_completion(prompt, output_type='text'):
			
 
				     """
			
 
				     deep_seek方法
			
@@ -31,7 +60,7 @@ def fetch_deepseek_completion(prompt, output_type='text'):
 
				                 "content": prompt,
			
 
				             }
			
 
				         ],
			
 
				-        model="deepseek-chat",
			
 
				+        model="deepseek-reasoner",
			
 
				         response_format=response_format,
			
 
				     )
			
 
				     response = chat_completion.choices[0].message.content
			
@@ -205,16 +234,80 @@ def evaluate_push_agent_prompt(dialogue_history, push_message, user_profile_, ag
 
				     return prompt_
			
 
				 
			
 
				 
			
 
				-def evaluate_reply_agent(dialogue_history, reply_message, user_profile_, agent_profile):
			
 
				+def evaluate_reply_agent_prompt(dialogue_history, message, user_profile_, agent_profile, push_time):
			
 
				     """
			
 
				 
			
 
				     :param dialogue_history:
			
 
				-    :param reply_message:
			
 
				+    :param message:
			
 
				     :param user_profile_:
			
 
				     :param agent_profile:
			
 
				     :return:
			
 
				     """
			
 
				-    return
			
 
				+    output_format = {
			
 
				+        "1.1": {
			
 
				+            "score": 1,
			
 
				+            "reason": "理由"
			
 
				+        },
			
 
				+        "1.2": {
			
 
				+            "score": 0,
			
 
				+            "reason": "理由"
			
 
				+        }
			
 
				+    }
			
 
				+    prompt_ = f"""
			
 
				+        **评估任务：** 基于给定的对话历史和 Agent 预设信息，评估 Agent 在对话中的表现。使用以下维度和指标进行评分。
			
 
				+        **评估指标：**
			
 
				+          1. 理解能力
			
 
				+            1.1 是否识别用户核心意图
			
 
				+            1.2 是否识别关键信息
			
 
				+            1.3 是否能够理解歧义词/模糊词
			
 
				+            1.4 是否能理解表情包，图片消息
			
 
				+            1.5 是否能理解语音/方言
			
 
				+          2. 回复能力
			
 
				+            2.1 回复是否与用户意图相关
			
 
				+            2.2 回复是否清晰简洁
			
 
				+            2.3 回复是否流畅
			
 
				+            2.4 回复语法是否规范
			
 
				+          3. 上下文管理能力
			
 
				+            3.1 是否能理解代词（他，她，它，这个那个）
			
 
				+            3.2 是否能延续上文话题
			
 
				+            3.3 是否记住上文的基础信息
			
 
				+            3.4 是否及时结束聊天
			
 
				+          4. 背景知识一致性
			
 
				+            4.1 是否讨论超出角色认知范围的信息
			
 
				+            4.2 是否讨论了不符合当前时代背景的语言、物品、事件、概念
			
 
				+            4.3 是否表现出与agent 人设相符的专业知识、生活经验或者常识
			
 
				+          5. 性格行为一致性
			
 
				+            5.1 agent 的言行是否反映其预设的核心性格
			
 
				+            5.2 agent 的价值观和道德观是否符合其预设标准
			
 
				+          6. 语言风格一致性
			
 
				+            6.1 agent 使用的词汇、句式、语法复杂度、行话/俚语是否符合其身份、教育背景和时代？
			
 
				+            6.2 agent 语气、语调（恭敬、傲慢、亲切、疏离、热情、冷淡）是否稳定？
			
 
				+            6.3 agent 表达习惯、口头禅是否符合角色预设特点
			
 
				+          7. 目标动机一致性
			
 
				+            7.1 agent 在对话中表现出的目标、关注重心是否与其设定的核心动机一致？
			
 
				+          8. 关系认知一致性
			
 
				+            8.1 agent 是否按照预设的互动模式与用户沟通 
			
 
				+            8.2 agent 是否对自身角色有正确理解
			
 
				+            8.3 agent 是否回复超越用户认知的信息
			
 
				+        
			
 
				+        **评估规则：**
			
 
				+        - 对于每个指标：
			
 
				+          - 如果符合要求，得 1 分。
			
 
				+          - 如果不符合要求，得 0 分。
			
 
				+          - 如果指标不适用（如对话未涉及相关场景），得 1 分（无需评估）。
			
 
				+        - 理由必须基于对话内容，简短且客观，理由需要是中文， 如果是无需评估，则理由写无需评估
			
 
				+        
			
 
				+        **输入：**
			
 
				+        - **对话历史**： {dialogue_history}
			
 
				+        - **Agent 预设信息**： {agent_profile}
			
 
				+        - **用户预设信息**： {user_profile_}
			
 
				+        - **Agent 消息**： {message}
			
 
				+        - **Agent 发送消息时间**：{push_time}
			
 
				+        
			
 
				+        **输出格式要求：JSON 格式**
			
 
				+        输出格式参考：{output_format}
			
 
				+        
			
 
				+    """
			
 
				+    return prompt_
			
 
				 
			
 
				 
			
 
				 
			
@@ -231,53 +324,62 @@ mysql_client = MySQLManager(config)
 
				 if __name__ == '__main__':
			
 
				     import pqai_agent.logging_service
			
 
				     pqai_agent.logging_service.setup_root_logger()
			
 
				-    with open("scripts/dialogue_data_set.json", "r", encoding="utf-8") as f:
			
 
				+    with open("push_message_evaluation_result.json", "r", encoding="utf-8") as f:
			
 
				         data = json.load(f)
			
 
				 
			
 
				-    dialogues = random.sample(data, 100)
			
 
				+    # dialogues = random.sample(data, 1)
			
 
				     F = []
			
 
				-    for sub_dialogues in tqdm(dialogues):
			
 
				+    for sub_dialogues in tqdm(data):
			
 
				         try:
			
 
				             # user 相关
			
 
				-            user_id = sub_dialogues['user_id']
			
 
				-            user_profile_response = get_profile_info(user_id, "user")
			
 
				-            user_profile, avatar = json.loads(user_profile_response[0]['profile']), user_profile_response[0]['avatar']
			
 
				-
			
 
				-            user_profile['avatar'] = avatar
			
 
				-            user_profile['current_datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			
 
				+            # user_id = sub_dialogues['user_id']
			
 
				+            # user_profile_response = get_profile_info(user_id, "user")
			
 
				+            # user_profile, avatar = json.loads(user_profile_response[0]['profile']), user_profile_response[0]['avatar']
			
 
				+            #
			
 
				+            # user_profile['avatar'] = avatar
			
 
				+            # user_profile['current_datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			
 
				+            #
			
 
				+            # # staff 相关
			
 
				+            # staff_id = sub_dialogues['staff_id']
			
 
				+            # staff_profile_response = get_profile_info(staff_id, "staff")
			
 
				+            # staff_profile = json.loads(staff_profile_response[0]['profile'])
			
 
				+            #
			
 
				+            # user_profile['formatted_staff_profile'] = staff_profile
			
 
				+            #
			
 
				+            # push_agent = MessagePushAgent()
			
 
				+            # # reply_agent = MessageReplyAgent()
			
 
				+            #
			
 
				+            # # message 相关
			
 
				+            # message = sub_dialogues['dialogue_history']
			
 
				+            # agent_message = push_agent.generate_message(
			
 
				+            #     context=user_profile,
			
 
				+            #     dialogue_history=message
			
 
				+            # )
			
 
				+            message = sub_dialogues["dialogue_history"]
			
 
				+            agent_message = sub_dialogues["push_message"]
			
 
				+            push_time = sub_dialogues["push_time"]
			
 
				+            user_profile = sub_dialogues["user_profile"]
			
 
				+            staff_profile = sub_dialogues["agent_profile"]
			
 
				+            if agent_message:
			
 
				+                prompt = evaluate_reply_agent_prompt(message, agent_message, user_profile, staff_profile, push_time)
			
 
				+                # prompt = evaluate_conversation_quality_task(message, user_profile, staff_profile)
			
 
				+                response = fetch_deepseek_completion(prompt, output_type='json')
			
 
				+                obj = {
			
 
				+                    "user_profile": user_profile,
			
 
				+                    "agent_profile": staff_profile,
			
 
				+                    "dialogue_history": message,
			
 
				+                    "push_message": agent_message,
			
 
				+                    "push_time": push_time,
			
 
				+                    "evaluation_result": response
			
 
				+                }
			
 
				+                F.append(obj)
			
 
				 
			
 
				-            # staff 相关
			
 
				-            staff_id = sub_dialogues['staff_id']
			
 
				-            staff_profile_response = get_profile_info(staff_id, "staff")
			
 
				-            staff_profile = json.loads(staff_profile_response[0]['profile'])
			
 
				+        except Exception as e:
			
 
				+            print(e)
			
 
				+            print(traceback.format_exc())
			
 
				 
			
 
				-            user_profile['formatted_staff_profile'] = staff_profile
			
 
				-
			
 
				-            push_agent = MessagePushAgent()
			
 
				-            # reply_agent = MessageReplyAgent()
			
 
				-
			
 
				-            # message 相关
			
 
				-            message = sub_dialogues['dialogue_history']
			
 
				-            agent_message = push_agent.generate_message(
			
 
				-                context=user_profile,
			
 
				-                dialogue_history=message
			
 
				-            )
			
 
				-            prompt = evaluate_push_agent_prompt(message, agent_message, user_profile, staff_profile)
			
 
				-            # prompt = evaluate_conversation_quality_task(message, user_profile, staff_profile)
			
 
				-            response = fetch_deepseek_completion(prompt, output_type='json')
			
 
				-            obj = {
			
 
				-                "user_profile": user_profile,
			
 
				-                "agent_profile": staff_profile,
			
 
				-                "dialogue_history": message,
			
 
				-                "push_message": agent_message,
			
 
				-                "push_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
			
 
				-                "evaluation_result": response
			
 
				-            }
			
 
				-            F.append(obj)
			
 
				-        except:
			
 
				-            continue
			
 
				 
			
 
				-    with open("push_message_evaluation_result.json", "w", encoding="utf-8") as f:
			
 
				-        f.write(json.dumps(F, indent=4, ensure_ascii=False))
			
 
				+    with open("push_message_evaluation_result_2.json", "w", encoding="utf-8") as f:
			
 
				+        f.write(json.dumps(F, ensure_ascii=False, indent=4))
			
 
				 
			
 
				 
			
--- a/evaluate_agent_v2.py
+++ b/evaluate_agent_v2.py
@@ -0,0 +1,242 @@
 
				+import json
			
 
				+
			
 
				+from openai import OpenAI
			
 
				+
			
 
				+def fetch_deepseek_completion(prompt, output_type='text'):
			
 
				+    """
			
 
				+    deep_seek方法
			
 
				+    """
			
 
				+    client = OpenAI(
			
 
				+        api_key='sk-cfd2df92c8864ab999d66a615ee812c5',
			
 
				+        base_url="https://api.deepseek.com"
			
 
				+    )
			
 
				+
			
 
				+    # get response format
			
 
				+    if output_type == "json":
			
 
				+        response_format = {"type": "json_object"}
			
 
				+    else:
			
 
				+        response_format = {"type": "text"}
			
 
				+
			
 
				+    chat_completion = client.chat.completions.create(
			
 
				+        messages=[
			
 
				+            {
			
 
				+                "role": "user",
			
 
				+                "content": prompt,
			
 
				+            }
			
 
				+        ],
			
 
				+        model="deepseek-chat",
			
 
				+        response_format=response_format,
			
 
				+    )
			
 
				+    response = chat_completion.choices[0].message.content
			
 
				+    if output_type == "json":
			
 
				+        response_json = json.loads(response)
			
 
				+        return response_json
			
 
				+
			
 
				+    return response
			
 
				+
			
 
				+
			
 
				+class PushAgentEvaluator:
			
 
				+    def __init__(self, dialogue_history, push_message, push_time, user_profile, agent_profile):
			
 
				+        self.dialogue_history = dialogue_history
			
 
				+        self.push_message = push_message
			
 
				+        self.user_profile = user_profile
			
 
				+        self.agent_profile = agent_profile
			
 
				+        self.push_time = push_time
			
 
				+
			
 
				+    def evaluate_user_match(self):
			
 
				+        """用户匹配度评估（24分）"""
			
 
				+        out_put_format = {
			
 
				+                "语言风格": {
			
 
				+                    "score": 8,
			
 
				+                    "reason": "语言风格符合用户特征，年龄、性别、地域等特征都匹配"
			
 
				+                },
			
 
				+                "信息边界": {
			
 
				+                    "score": 8,
			
 
				+                    "reason": "信息范围符合用户认知范围"
			
 
				+                },
			
 
				+                "特征匹配": {
			
 
				+                    "score": 8,
			
 
				+                    "reason": "特征匹配度高，user_profile中有兴趣爱好，健康状态，用药状态，居住地址，家庭成员等特征，在 push_message需要体现出来"
			
 
				+            }
			
 
				+        }
			
 
				+        analysis_prompt = f"""
			
 
				+            通过分析用户的以下特征
			
 
				+                年龄: {self.user_profile['age']}, 
			
 
				+                性别: {self.user_profile['gender']}, 
			
 
				+                兴趣: {self.user_profile['interests']},
			
 
				+                健康状态: {self.user_profile['health_conditions']},
			
 
				+                用药状态: {self.user_profile['medications']},
			
 
				+                居住地址: {self.user_profile['region']},
			
 
				+                家庭成员: {self.user_profile['family_members']},
			
 
				+            和 agent 发送的消息: {self.push_message}, 判断该消息是否和该用户的特征匹配
			
 
				+            包括以下三点，每一点的满分为 8 分：
			
 
				+                语言风格：是否符合用户的年龄、性别、地域等特征；
			
 
				+                信息范围：是否超出用户认知范围；
			
 
				+                特征匹配：如果 user_profile中有兴趣爱好，健康状态，用药状态，居住地址，家庭成员等特征，在 push_message需要体现出来
			
 
				+            输出的格式为 JSON，示例如：{out_put_format}
			
 
				+        """
			
 
				+        completion = fetch_deepseek_completion(prompt=analysis_prompt, output_type='json')
			
 
				+        return completion
			
 
				+
			
 
				+    def evaluate_agent_consistency(self):
			
 
				+        """人设一致性评估（16分）"""
			
 
				+        out_put_format = {
			
 
				+            "语言风格": {
			
 
				+                "score": 8,
			
 
				+                "reason": "语言风格符合 agent 特征，年龄、性别、地域等特征都匹配"
			
 
				+            },
			
 
				+            "消息边界": {
			
 
				+                "score": 8,
			
 
				+                "reason": "信息范围符合 agent 认知范围"
			
 
				+            },
			
 
				+        }
			
 
				+        analysis_prompt = f"""
			
 
				+            通过分析 agent 的以下特征：
			
 
				+                职业：{self.agent_profile['occupation']},
			
 
				+                年龄：{self.agent_profile['age']},
			
 
				+                性别：{self.agent_profile['gender']},
			
 
				+                地址：{self.agent_profile['region']},
			
 
				+                教育背景：{self.agent_profile['education']},
			
 
				+                工作经验：{self.agent_profile['work_experience']}
			
 
				+            和 agent 发送的消息: {self.push_message}, 判断该消息是否和该 agent 的特征匹配
			
 
				+            包括以下 2 点，每一点的满分为 8 分：
			
 
				+                语言风格：是否符合 agent 的年龄、性别、地域等特征；需要从不同性别，不同职业，不同年龄， 不同地域的人的说话风格去分析
			
 
				+                信息范围：是否超出 agent 认知范围；
			
 
				+            输出的格式为 JSON，示例如：{out_put_format}
			
 
				+        """
			
 
				+        completion = fetch_deepseek_completion(prompt=analysis_prompt, output_type='json')
			
 
				+        return completion
			
 
				+
			
 
				+    def evaluate_interest_arousal(self):
			
 
				+        """兴趣激发评估（30）"""
			
 
				+        out_put_format = {
			
 
				+            "好奇设计": {
			
 
				+                "score": 15,
			
 
				+                "reason": "包含开放式问题，包括对用户认知边界的探索，用户兴趣爱好的联想等方面"
			
 
				+            },
			
 
				+            "利益设计": {
			
 
				+                "score": 15,
			
 
				+                "reason": "如果用户在历史对话中提到了某种需求，新的唤起消息是否有为用户解决需求的趋势"
			
 
				+            }
			
 
				+        }
			
 
				+        analysis_prompt = f"""
			
 
				+            通过分析用户的以下特征
			
 
				+                年龄: {self.user_profile['age']}, 
			
 
				+                性别: {self.user_profile['gender']}, 
			
 
				+                兴趣: {self.user_profile['interests']},
			
 
				+                健康状态: {self.user_profile['health_conditions']},
			
 
				+                用药状态: {self.user_profile['medications']},
			
 
				+                居住地址: {self.user_profile['region']},
			
 
				+                家庭成员: {self.user_profile['family_members']},
			
 
				+            和 agent 发送的消息: {self.push_message}, 以及用户和 agent 的历史对话: {self.dialogue_history}, 
			
 
				+            判断该消息是否能唤起用户的兴趣，驱动用户的聊天激情，主要从以下 2 个方面考虑，每一个要点满分 15分
			
 
				+                好奇设计：是否包含开放式问题，包括对用户认知边界的探索，用户兴趣爱好的联想等方面
			
 
				+                利益设计：如果用户在历史对话中提到了某种需求，新的唤起消息是否有为用户解决需求的趋势；
			
 
				+                        如果没提出，可以从用户特征中联想出用户的需求，比如用户的兴趣爱好，健康状态，用药状态，居住地址，家庭成员等特征，
			
 
				+                        在 push_message需要体现出来
			
 
				+            输出的格式为 JSON，示例如：{out_put_format}
			
 
				+        """
			
 
				+        completion = fetch_deepseek_completion(prompt=analysis_prompt, output_type='json')
			
 
				+        return completion
			
 
				+
			
 
				+    def evaluate_execution_quality(self):
			
 
				+        """执行质量评估（15分）"""
			
 
				+        output_format = {
			
 
				+            "信息保真": {
			
 
				+                "score": 5,
			
 
				+                "reason": "信息真实"
			
 
				+            },
			
 
				+            "政策合规": {
			
 
				+                "score": 5,
			
 
				+                "reason": "发送消息是否符合现有政策要求，是否违反相关规则"
			
 
				+            },
			
 
				+            "语言拟人": {
			
 
				+                "score": 5,
			
 
				+                "reason": "表现出一定的拟人化和情商"
			
 
				+            }
			
 
				+        }
			
 
				+        analysis_prompt = f"""
			
 
				+            通过分析 agent 发送的消息：{self.push_message}, 
			
 
				+            信息保真（满分 5 分）:
			
 
				+                拆分消息重点各个事件点，联网搜索，判断事件是否为真实事件，比如说节日祝福需要和发送时间{self.push_time}对应， 天气需要和地区相关
			
 
				+                不能在早上发晚上好之类等等
			
 
				+            政策合规（满分 5 分）:
			
 
				+                并且判断发送的消息是否存在违反规则的现象。
			
 
				+            语言拟人（满分 5 分）:
			
 
				+                判断发送的消息是否存在机械化，同质化的现象，比如重复发相同或者相似的消息，比如过于机械的回复等等
			
 
				+            输出的格式为 JSON，示例如：{output_format}
			
 
				+        """
			
 
				+        completion = fetch_deepseek_completion(prompt=analysis_prompt, output_type='json')
			
 
				+        return completion
			
 
				+
			
 
				+    def evaluate_emotional_intelligence(self):
			
 
				+        """情感智能评估（15分）"""
			
 
				+        output_format = {
			
 
				+            "共情深度": {
			
 
				+                "score": 10,
			
 
				+                "reason": "识别用户情感状态，提供相应的回应"
			
 
				+            },
			
 
				+        }
			
 
				+        analysis_prompt = f"""
			
 
				+             通过分析用户的以下特征
			
 
				+                年龄: {self.user_profile['age']}, 
			
 
				+                性别: {self.user_profile['gender']}, 
			
 
				+                兴趣: {self.user_profile['interests']},
			
 
				+                健康状态: {self.user_profile['health_conditions']},
			
 
				+                用药状态: {self.user_profile['medications']},
			
 
				+                居住地址: {self.user_profile['region']},
			
 
				+                家庭成员: {self.user_profile['family_members']},
			
 
				+            和 agent 和用户的对话：{self.dialogue_history} 挖掘出用户的情感需求
			
 
				+            并且 agent 发送的消息: {self.push_message}, 判断该message是否能和用户共情，提升用户的情感体验
			
 
				+            满分 15分
			
 
				+            输出格式为 JSON，示例如：{output_format}
			
 
				+        """
			
 
				+        completion = fetch_deepseek_completion(prompt=analysis_prompt, output_type='json')
			
 
				+        return completion
			
 
				+
			
 
				+
			
 
				+def evaluate_push_agent(
			
 
				+        dialogue_history,
			
 
				+        push_message,
			
 
				+        push_time,
			
 
				+        user_profile,
			
 
				+        agent_profile
			
 
				+):
			
 
				+    # 创建评估引擎实例
			
 
				+    evaluator = PushAgentEvaluator(
			
 
				+        dialogue_history,
			
 
				+        push_message,
			
 
				+        push_time,
			
 
				+        user_profile,
			
 
				+        agent_profile,
			
 
				+    )
			
 
				+
			
 
				+    # 执行模块化评估
			
 
				+    evaluation_report = {
			
 
				+        "基础能力": {
			
 
				+            "用户匹配": evaluator.evaluate_user_match(),
			
 
				+            "人设一致": evaluator.evaluate_agent_consistency()
			
 
				+        },
			
 
				+        "任务效能": {
			
 
				+            "兴趣激发": evaluator.evaluate_interest_arousal(),
			
 
				+            "执行质量": evaluator.evaluate_execution_quality(),
			
 
				+            "情感智能": evaluator.evaluate_emotional_intelligence()
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return evaluation_report
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    with open("dev.json") as f:
			
 
				+        data = json.load(f)
			
 
				+
			
 
				+    evaluation_report = evaluate_push_agent(
			
 
				+        data['dialogue_history'],
			
 
				+        data['push_message'],
			
 
				+        data['push_time'],
			
 
				+        data['user_profile'],
			
 
				+        data['agent_profile']
			
 
				+    )
			
 
				+    print(json.dumps(evaluation_report, indent=4, ensure_ascii=False))
			
 
				+
			
--- a/push_message_evaluation_result.json
+++ b/push_message_evaluation_result.json
@@ -14831,6 +14831,7 @@
 
				             "improvement_suggestions": "建议更深入地挖掘和利用用户已知的兴趣爱好来定制push_message，以增强相关性和用户参与度。同时，可以尝试更多共情的表达方式，以提升用户的情感体验。"
			
 
				         }
			
 
				     },
			
 
				+
			
 
				     {
			
 
				         "user_profile": {
			
 
				             "name": "",