丁云鹏 22 horas atrás
pai
commit
6ea5abd416
1 arquivo alterado com 37 adições e 14 exclusões
  1. 37 14
      agents/clean_agent/tools.py

+ 37 - 14
agents/clean_agent/tools.py

@@ -237,14 +237,27 @@ def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
         # 处理返回结果
         evaluation_results = []
         for i, result in enumerate(results):
-            # 只处理大括号外面的内容,保留JSON内部格式
-            result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
-            result = json.loads(result)
-            parsing_id = contents[i].id
-            parsing_data = contents[i].parsing_data  
-            content_id = contents[i].content_id
-            score = result.get("score", -2)
-            score_reason = result.get("reason", "")
+            try:
+                # 只处理大括号外面的内容,保留JSON内部格式
+                result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
+                # 尝试修复常见的JSON格式问题
+                result = result.replace("'", "\"")  # 将单引号替换为双引号
+                result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result)  # 确保属性名有双引号
+                
+                # 解析JSON
+                parsed_result = json.loads(result)
+                parsing_id = contents[i].id
+                parsing_data = contents[i].parsing_data  
+                content_id = contents[i].content_id
+                score = parsed_result.get("score", -2)
+                score_reason = parsed_result.get("reason", "")
+            except Exception as json_error:
+                logger.error(f"评估JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
+                parsing_id = contents[i].id
+                parsing_data = contents[i].parsing_data
+                content_id = contents[i].content_id
+                score = -1
+                score_reason = f"JSON解析错误: {str(json_error)}"
             
             evaluation_results.append((parsing_id, score, score_reason, parsing_data, content_id))
         
@@ -277,17 +290,27 @@ def batch_call_llm_for_extraction(evaluation_results: list, query_word: str) ->
         # 处理返回结果
         extraction_results = []
         for i, result in enumerate(results):
-            # 只处理大括号外面的内容,保留JSON内部格式
-            result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
-            result = json.loads(result)
-            extracted_data = result.get("extracted_content", "未提取到内容")
-            clean_reason = result.get("analysis_reason", "未返回原因")
+            try:
+                # 只处理大括号外面的内容,保留JSON内部格式
+                result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
+                # 尝试修复常见的JSON格式问题
+                result = result.replace("'", "\"")  # 将单引号替换为双引号
+                result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result)  # 确保属性名有双引号
+                
+                # 解析JSON
+                parsed_result = json.loads(result)
+                extracted_data = parsed_result.get("extracted_content", "未提取到内容")
+                clean_reason = parsed_result.get("analysis_reason", "未返回原因")
+            except Exception as json_error:
+                logger.error(f"JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
+                extracted_data = "未提取到内容"
+                clean_reason = f"JSON解析错误: {str(json_error)}"
             
             extraction_results.append((extracted_data, clean_reason))
         
         return extraction_results
         
     except Exception as e:
-        logger.error(f"批量抽取过程异常: {str(e)}")
+        logger.error(f"批量抽取过程异常: {str(e)} results:{results}")
         # 返回空结果,确保返回类型为元组列表
         return [("未提取到内容", "抽取过程异常") for _ in range(len(evaluation_results))]