evaluate_agent.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
import datetime
import json
import logging
import os
import random

from openai import OpenAI
from pymysql.cursors import DictCursor
from tqdm import tqdm

from pqai_agent.agents.message_push_agent import MessagePushAgent
from pqai_agent.agents.message_reply_agent import MessageReplyAgent
from pqai_agent.database import MySQLManager
  10. def fetch_deepseek_completion(prompt, output_type='text'):
  11. """
  12. deep_seek方法
  13. """
  14. client = OpenAI(
  15. api_key='sk-cfd2df92c8864ab999d66a615ee812c5',
  16. base_url="https://api.deepseek.com"
  17. )
  18. # get response format
  19. if output_type == "json":
  20. response_format = {"type": "json_object"}
  21. else:
  22. response_format = {"type": "text"}
  23. chat_completion = client.chat.completions.create(
  24. messages=[
  25. {
  26. "role": "user",
  27. "content": prompt,
  28. }
  29. ],
  30. model="deepseek-chat",
  31. response_format=response_format,
  32. )
  33. response = chat_completion.choices[0].message.content
  34. if output_type == "json":
  35. response_json = json.loads(response)
  36. return response_json
  37. return response
  38. def get_profile_info(user_id_, user_type):
  39. match user_type:
  40. case "user":
  41. sql = f"""
  42. select iconurl as 'avatar', profile_data_v1 as 'profile'
  43. from third_party_user where third_party_user_id = %s;
  44. """
  45. case "staff":
  46. sql = f"""
  47. select agent_profile as 'profile'
  48. from qywx_employee where third_party_user_id = %s;
  49. """
  50. case _:
  51. raise ValueError("user_type must be 'user' or 'staff'")
  52. return mysql_client.select(sql, cursor_type=DictCursor, args=(user_id_,))
  53. def evaluate_conversation_quality_task(dialogue_history, user_profile_, agent_profile):
  54. """
  55. :param dialogue_history:
  56. :param user_profile_:
  57. :param agent_profile:
  58. :return:
  59. """
  60. output_format = {
  61. "1.1": {
  62. "score": 5,
  63. "reason": ""
  64. },
  65. "1.2": {
  66. "score": 8,
  67. "reason": "reason"
  68. },
  69. "1.3": {
  70. "score": 10,
  71. "reason": "reason"
  72. },
  73. "1.4": {
  74. "score": 10,
  75. "reason": "reason"
  76. },
  77. "1.5": {
  78. "score": 10,
  79. "reason": "reason"
  80. },
  81. "1.6": {
  82. "score": 10,
  83. "reason": "reason"
  84. },
  85. "2.1": {
  86. "score": 9,
  87. "reason": "reason"
  88. },
  89. "2.2": {
  90. "score": 10,
  91. "reason": "reason"
  92. },
  93. "2.3": {
  94. "score": 10,
  95. "reason": "reason"
  96. },
  97. "total_score": "total_score",
  98. "improvement_suggestions": "suggestions",
  99. }
  100. prompt_ = f"""
  101. 你是一名优秀的 agent 评估员,请根据以下场景和输入,对该 agent 的回复能力进行评估,用分数量化
  102. 场景:
  103. 智能体对话场景, 智能体(agent)和用户(user)进行对话聊天
  104. 输入:
  105. agent 的人设:agent_profile: {agent_profile}
  106. 用户的人设: user_profile: {user_profile_}
  107. 对话历史:dialogue_history: {dialogue_history}
  108. 评估标准, 满分为 100分,拆分到以下每一个小项,每一个小项的得分表示该小项的能力,60% 的分表示及格,80% 的分表示优秀:
  109. 1. 对话能力(30分)
  110. 1.1 语言是否流畅(10分)
  111. 1.2 上下文是否连贯,语义是否一致(10分)
  112. 1.3 agent 是否感知用户结束聊天的意图并且适当结束聊天(10分)
  113. 1.4 agent 回复消息的时间间隔是否合理,符合真人对话规律 (10分)
  114. 1.5 agent 回复的消息是否具有高情商,互动能力是否好,能否和用户共情,提升用户的情感体验 (20分)
  115. 1.6 agent 回复的消息是否解决了用户提出的问题 (10分)
  116. 2. 角色一致性(30分)
  117. 2.1 agent 语言风格是否符合agent人设(10分)
  118. 2.2 agent 语言风格是否适合用户人设(10分)
  119. 2.3 agent 回复内容不要超越用户的认知上限(10分)
  120. 输出:
  121. 输出为 json 格式,输出格式规范 {output_format}
  122. """
  123. return prompt_
  124. def evaluate_push_agent_prompt(dialogue_history, push_message, user_profile_, agent_profile):
  125. """
  126. :param dialogue_history:
  127. :param push_message:
  128. :param user_profile_:
  129. :param agent_profile:
  130. :return:
  131. """
  132. output_format = {
  133. "1.1": {
  134. "score": 5,
  135. "reason": "push_message尝试联系用户的头像,但用户兴趣未明确提及戏曲"
  136. },
  137. "1.2": {
  138. "score": 8,
  139. "reason": "语言风格轻松友好,适合大多数用户,但未完全匹配用户特定风格"
  140. },
  141. "1.3": {
  142. "score": 10,
  143. "reason": "信息未超出用户认知范围"
  144. },
  145. "2.1": {
  146. "score": 9,
  147. "reason": "语言风格符合agent人设,友好且亲切"
  148. },
  149. "2.2": {
  150. "score": 10,
  151. "reason": "信息未超出agent人设的认知范围"
  152. },
  153. "3.1": {
  154. "score": 15,
  155. "reason": "push_message有潜力勾起用户兴趣,但未直接关联用户已知兴趣"
  156. },
  157. "3.2": {
  158. "score": 10,
  159. "reason": "信息真实"
  160. },
  161. "3.3": {
  162. "score": 12,
  163. "reason": "表现出一定的拟人化和情商,但共情程度可进一步提升"
  164. },
  165. "total_score": 79,
  166. "improvement_suggestions": "建议更深入地挖掘和利用用户已知的兴趣爱好来定制push_message,以增强相关性和用户参与度。同时,可以尝试更多共情的表达方式,以提升用户的情感体验。"
  167. }
  168. prompt_ = f"""
  169. 你是一名优秀的 agent 评估员,请根据以下场景和输入,对该 agent 的能力进行评估,用分数量化
  170. 场景:
  171. 智能体对话场景, 智能体(agent)向用户发起对话
  172. agent 需要通过分析 user 和 agent 直接的历史对话,以及 user 和 agent 的人设信息,向用户发送一条消息(push_message)
  173. 输入:
  174. agent 的人设:agent_profile: {agent_profile}
  175. 用户的人设: user_profile: {user_profile_}
  176. 对话历史:dialogue_history: {dialogue_history}
  177. agent 的唤起对话:push_message: {push_message}
  178. 评估标准, 满分为 100分,拆分到以下每一个小项,每一个小项的得分表示该小项的能力,60% 的分表示及格,80% 的分表示优秀:
  179. 1. push_message 的内容 和 user_profile的相关性(30分)
  180. 1.1 push_message 是否迎合用户的兴趣爱好 (满分 10分)
  181. 1.2 push_message 的语言风格是否适合用户语言风格 (满分 10分)
  182. 1.3 push_message 的信息是否超出用户的认知范围 (满分 10分)
  183. 2. push_message 和 agent_profile 的相关性(20分)
  184. 2.1 push_message 的语言风格是否符合 agent 人设(满分 10分)
  185. 2.2 push_message 的信息是否超出 agent人设的认知范围(满分 10分)
  186. 3. push_message 质量量化 (50分)
  187. 3.1 push_message 是否能勾起用户的兴趣,驱动用户聊天激情 (满分 25分)
  188. 3.2 push_message 的信息是否真实 (满分 10分)
  189. 3.3 push_message 是否具有拟人化,高情商,与用户共情,提升用户的情感体验(满分 15分)
  190. 输出:
  191. 输出为 json 格式,输出格式规范 {output_format}
  192. """
  193. return prompt_
  194. def evaluate_reply_agent(dialogue_history, reply_message, user_profile_, agent_profile):
  195. """
  196. :param dialogue_history:
  197. :param reply_message:
  198. :param user_profile_:
  199. :param agent_profile:
  200. :return:
  201. """
  202. return
  203. config = {
  204. 'host': 'rm-bp13g3ra2f59q49xs.mysql.rds.aliyuncs.com',
  205. 'port': 3306,
  206. 'user': 'wqsd',
  207. 'password': 'wqsd@2025',
  208. 'database': 'ai_agent',
  209. 'charset': 'utf8mb4'
  210. }
  211. mysql_client = MySQLManager(config)
  212. if __name__ == '__main__':
  213. import pqai_agent.logging_service
  214. pqai_agent.logging_service.setup_root_logger()
  215. with open("scripts/dialogue_data_set.json", "r", encoding="utf-8") as f:
  216. data = json.load(f)
  217. dialogues = random.sample(data, 100)
  218. F = []
  219. for sub_dialogues in tqdm(dialogues):
  220. try:
  221. # user 相关
  222. user_id = sub_dialogues['user_id']
  223. user_profile_response = get_profile_info(user_id, "user")
  224. user_profile, avatar = json.loads(user_profile_response[0]['profile']), user_profile_response[0]['avatar']
  225. user_profile['avatar'] = avatar
  226. user_profile['current_datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  227. # staff 相关
  228. staff_id = sub_dialogues['staff_id']
  229. staff_profile_response = get_profile_info(staff_id, "staff")
  230. staff_profile = json.loads(staff_profile_response[0]['profile'])
  231. user_profile['formatted_staff_profile'] = staff_profile
  232. push_agent = MessagePushAgent()
  233. # reply_agent = MessageReplyAgent()
  234. # message 相关
  235. message = sub_dialogues['dialogue_history']
  236. agent_message = push_agent.generate_message(
  237. context=user_profile,
  238. dialogue_history=message
  239. )
  240. prompt = evaluate_push_agent_prompt(message, agent_message, user_profile, staff_profile)
  241. # prompt = evaluate_conversation_quality_task(message, user_profile, staff_profile)
  242. response = fetch_deepseek_completion(prompt, output_type='json')
  243. obj = {
  244. "user_profile": user_profile,
  245. "agent_profile": staff_profile,
  246. "dialogue_history": message,
  247. "push_message": agent_message,
  248. "push_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  249. "evaluation_result": response
  250. }
  251. F.append(obj)
  252. except:
  253. continue
  254. with open("push_message_evaluation_result.json", "w", encoding="utf-8") as f:
  255. f.write(json.dumps(F, indent=4, ensure_ascii=False))