Bladeren bron

Update dialogue_manager: support images

StrayWarrior 4 dagen geleden
bovenliggende
commit
a42f0135b4
2 gewijzigde bestanden met toevoegingen van 47 en 10 verwijderingen
  1. 1 0
      chat_service.py
  2. 46 10
      dialogue_manager.py

+ 1 - 0
chat_service.py

@@ -19,6 +19,7 @@ VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
 VOLCENGINE_MODEL_DEEPSEEK_V3 = "deepseek-v3-250324"
 VOLCENGINE_MODEL_DOUBAO_PRO_1_5 = 'ep-20250307150409-4blz9'
 VOLCENGINE_MODEL_DOUBAO_PRO_32K = 'ep-20250414202859-6nkz5'
+VOLCENGINE_MODEL_DOUBAO_1_5_VISION_PRO = 'ep-20250421193334-nz5wd'
 DEEPSEEK_API_TOKEN = 'sk-67daad8f424f4854bda7f1fed7ef220b'
 DEEPSEEK_BASE_URL = 'https://api.deepseek.com/'
 DEEPSEEK_CHAT_MODEL = 'deepseek-chat'

+ 46 - 10
dialogue_manager.py

@@ -436,29 +436,63 @@ class DialogueManager:
         return state_to_prompt_map[state]
 
     @staticmethod
-    def _select_coze_bot(state):
+    def _select_coze_bot(state, dialogue: List[Dict], multimodal=False):
         state_to_bot_map = {
             DialogueState.GREETING: '7486112546798780425',
             DialogueState.CHITCHAT: '7491300566573301770',
-            DialogueState.FAREWELL: '7491300566573301770'
+            DialogueState.FAREWELL: '7491300566573301770',
         }
+
+        if multimodal:
+            state_to_bot_map = {
+                DialogueState.GREETING: '7486112546798780425',
+                DialogueState.CHITCHAT: '7495692989504438308',
+                DialogueState.FAREWELL: '7491300566573301770',
+            }
         return state_to_bot_map[state]
 
+    @staticmethod
+    def need_multimodal_model(dialogue: List[Dict], max_message_to_use: int = 10):
+        # Only a simple implementation for now
+        recent_messages = dialogue[-max_message_to_use:]
+        ret = False
+        for entry in recent_messages:
+            if entry.get('type') in (MessageType.IMAGE_GW, MessageType.IMAGE_QW):
+                ret = True
+                break
+        return ret
+
     def _create_system_message(self, prompt_context):
         prompt_template = self._select_prompt(self.current_state)
         prompt = prompt_template.format(**prompt_context)
         return {'role': 'system', 'content': prompt}
 
     @staticmethod
-    def compose_chat_messages_openai_compatible(dialogue_history, current_time):
+    def compose_chat_messages_openai_compatible(dialogue_history, current_time, multimodal=False):
         messages = []
         for entry in dialogue_history:
             role = entry['role']
+            msg_type = entry.get('type', MessageType.TEXT)
             fmt_time = DialogueManager.format_timestamp(entry['timestamp'])
-            messages.append({
-                "role": role,
-                "content": '[{}] {}'.format(fmt_time, entry["content"])
-            })
+            if msg_type in (MessageType.IMAGE_GW, MessageType.IMAGE_QW):
+                if multimodal:
+                    messages.append({
+                        "role": role,
+                        "content": [
+                            {"type": "image_url", "image_url": {"url": entry["content"]}}
+                        ]
+                    })
+                else:
+                    logger.warning("Image in non-multimodal mode")
+                    messages.append({
+                        "role": role,
+                        "content": "[{}] {}".format(fmt_time, '[图片]')
+                    })
+            else:
+                messages.append({
+                    "role": role,
+                    "content": '[{}] {}'.format(fmt_time, entry["content"])
+                })
         # Append a prefix message to constrain the time context
         msg_prefix = '[{}]'.format(current_time)
         messages.append({'role': 'assistant', 'content': msg_prefix})
@@ -467,7 +501,7 @@ class DialogueManager:
     @staticmethod
     def compose_chat_messages_coze(dialogue_history, current_time, staff_id, user_id):
         messages = []
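-        # If the first message after system is not a user message, a user message must be added
+        # If the first message after system is not a user message, a user message must be prepended at the very start; otherwise the assistant message gets swallowed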
         if len(dialogue_history) > 0 and dialogue_history[0]['role'] != 'user':
             fmt_time = DialogueManager.format_timestamp(dialogue_history[0]['timestamp'])
             messages.append(cozepy.Message.build_user_question_text(f'[{fmt_time}] '))
@@ -531,10 +565,12 @@ class DialogueManager:
         if overwrite_context and 'current_time' in overwrite_context:
             current_time = overwrite_context.get('current_time')
 
+        need_multimodal = self.need_multimodal_model(dialogue_history)
+        config['use_multimodal_model'] = need_multimodal
         if chat_service_type == ChatServiceType.OPENAI_COMPATIBLE:
             system_message = self._create_system_message(prompt_context)
             messages.append(system_message)
-            messages.extend(self.compose_chat_messages_openai_compatible(dialogue_history, current_time))
+            messages.extend(self.compose_chat_messages_openai_compatible(dialogue_history, current_time, need_multimodal))
         elif chat_service_type == ChatServiceType.COZE_CHAT:
             dialogue_history = dialogue_history[-95:] # Coze最多支持100条,还需要附加系统消息
             messages = self.compose_chat_messages_coze(dialogue_history, current_time, self.staff_id, self.user_id)
@@ -543,7 +579,7 @@ class DialogueManager:
                 custom_variables[k] = str(v)
             custom_variables.pop('user_profile', None)
             config['custom_variables'] = custom_variables
-            config['bot_id'] = self._select_coze_bot(self.current_state)
+            config['bot_id'] = self._select_coze_bot(self.current_state, dialogue_history, need_multimodal)
 
         #FIXME(zhoutian): temporary alert
         if user_message and not messages: