|
@@ -0,0 +1,138 @@
|
|
|
|
+import json
|
|
|
|
+import time
|
|
|
|
+import random
|
|
|
|
+
|
|
|
|
+from tqdm import tqdm
|
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
|
+from pqai_agent.database import MySQLManager
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+config = {
|
|
|
|
+ 'host': 'rm-bp13g3ra2f59q49xs.mysql.rds.aliyuncs.com',
|
|
|
|
+ 'port': 3306,
|
|
|
|
+ 'user': 'wqsd',
|
|
|
|
+ 'password': 'wqsd@2025',
|
|
|
|
+ 'database': 'ai_agent',
|
|
|
|
+ 'charset': 'utf8mb4'
|
|
|
|
+}
|
|
|
|
+mysql_client = MySQLManager(config)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def split_dialogue_history(dialogue_history_, timeout=30*60*1000):
|
|
|
|
+ """
|
|
|
|
+ :param dialogue_history_:
|
|
|
|
+ :param timeout: 30 minutes
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ messages_sorted = sorted(dialogue_history_, key=lambda x: x['timestamp'])
|
|
|
|
+ dialogues = []
|
|
|
|
+ current_dialogue = []
|
|
|
|
+
|
|
|
|
+ for i, msg in enumerate(messages_sorted):
|
|
|
|
+ if not current_dialogue:
|
|
|
|
+ current_dialogue.append(msg)
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ prev_msg = messages_sorted[i - 1]
|
|
|
|
+ time_diff = msg["timestamp"] - prev_msg["timestamp"]
|
|
|
|
+
|
|
|
|
+ # 判断是否为新对话
|
|
|
|
+ is_new_dialogue = False
|
|
|
|
+ if time_diff > timeout:
|
|
|
|
+ is_new_dialogue = True
|
|
|
|
+
|
|
|
|
+ if is_new_dialogue:
|
|
|
|
+ dialogues.append(current_dialogue)
|
|
|
|
+ current_dialogue = [msg]
|
|
|
|
+ else:
|
|
|
|
+ current_dialogue.append(msg)
|
|
|
|
+
|
|
|
|
+ if current_dialogue:
|
|
|
|
+ dialogues.append(current_dialogue)
|
|
|
|
+
|
|
|
|
+ return dialogues
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def get_conversation_info():
|
|
|
|
+ sql = f"""
|
|
|
|
+ select roomid, count(id) as 'article_num'
|
|
|
|
+ from qywx_chat_history where msg_type = 1 group by roomid
|
|
|
|
+ having count(id) > 20;
|
|
|
|
+ """
|
|
|
|
+ return mysql_client.select(sql, cursor_type=DictCursor)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def get_dialogue_history(room_id_):
|
|
|
|
+ """
|
|
|
|
+ 获取对话历史
|
|
|
|
+ :param room_id_:
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ sql = f"""
|
|
|
|
+ select sender, receiver, sendtime, content
|
|
|
|
+ from qywx_chat_history
|
|
|
|
+ where roomid = %s and msg_type = %s;
|
|
|
|
+ """
|
|
|
|
+ return mysql_client.select(sql=sql, cursor_type=DictCursor, args=(room_id_, 1))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def get_profile_info(user_id_, user_type):
|
|
|
|
+ match user_type:
|
|
|
|
+ case "user":
|
|
|
|
+ sql = f"""
|
|
|
|
+ select iconurl as 'avatar', profile_data_v1 as 'profile'
|
|
|
|
+ from third_party_user where third_party_user_id = %s;
|
|
|
|
+ """
|
|
|
|
+ case "staff":
|
|
|
|
+ sql = f"""
|
|
|
|
+ select agent_profile as 'profile'
|
|
|
|
+ from qywx_employee where third_party_user_id = %s;
|
|
|
|
+ """
|
|
|
|
+ case _:
|
|
|
|
+ raise ValueError("user_type must be 'user' or 'staff'")
|
|
|
|
+
|
|
|
|
+ return mysql_client.select(sql, cursor_type=DictCursor, args=(user_id_,))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
+ conversation_info_list = get_conversation_info()
|
|
|
|
+ data_set = []
|
|
|
|
+ for conversation_info in tqdm(conversation_info_list):
|
|
|
|
+ room_id = conversation_info["roomid"]
|
|
|
|
+ staff_id = room_id.split(":")[1]
|
|
|
|
+ user_id = room_id.split(":")[2]
|
|
|
|
+ if staff_id and user_id:
|
|
|
|
+ dialogue_history = get_dialogue_history(room_id)
|
|
|
|
+ for idx, dialogue_info in enumerate(dialogue_history):
|
|
|
|
+ if dialogue_info["sender"] == staff_id:
|
|
|
|
+ conversation = dialogue_history[: idx]
|
|
|
|
+ history_conversation = [
|
|
|
|
+ {
|
|
|
|
+ "content": i['content'],
|
|
|
|
+ "role": "assistant" if i['sender'] == staff_id else "user",
|
|
|
|
+ "timestamp": int(i['sendtime'] / 1000)
|
|
|
|
+ } for i in conversation]
|
|
|
|
+ # filter history_conversation
|
|
|
|
+ history_conversation = [i for i in history_conversation if i['timestamp'] > int(dialogue_info['sendtime'] / 1000) - 60 * 60 * 24 * 30]
|
|
|
|
+
|
|
|
|
+ if len(history_conversation) > 100:
|
|
|
|
+ history_conversation = history_conversation[-100:]
|
|
|
|
+
|
|
|
|
+ reply_msg = dialogue_info['content']
|
|
|
|
+ obj = {
|
|
|
|
+ "staff_id": staff_id,
|
|
|
|
+ "user_id": user_id,
|
|
|
|
+ "conversation": history_conversation,
|
|
|
|
+ "reply_msg": reply_msg
|
|
|
|
+ }
|
|
|
|
+ data_set.append(obj)
|
|
|
|
+
|
|
|
|
+ with open("reply_data_set_filter.json", "w", encoding="utf-8") as f:
|
|
|
|
+ f.write(json.dumps(data_set, ensure_ascii=False, indent=4))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|