Browse Source

Merge branch 'main' of https://git.yishihui.com/ai/knowledge-agent

jihuaqiang 1 day ago
parent
commit
43e16488cf
1 changed file with 37 additions and 16 deletions
  1. 37 16
      agents/clean_agent/tools.py

+ 37 - 16
agents/clean_agent/tools.py

@@ -78,8 +78,6 @@ def execute_continuous_evaluation_extraction(request_id: str, db: Session, query
             try:
                 # 批量评估内容并创建KnowledgeExtractionContent对象
                 evaluation_results = batch_evaluate_content(contents, db, request_id, query_word)
-                
-                print(f"""evaluation_results: {evaluation_results}""")
 
                 # 对评分大于阈值的内容进行抽取
                 high_score_results = [result for result in evaluation_results if result["score"] >= SCORE_THRESHOLD]
@@ -237,14 +235,27 @@ def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
         # 处理返回结果
         evaluation_results = []
         for i, result in enumerate(results):
-            # 只处理大括号外面的内容,保留JSON内部格式
-            result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
-            result = json.loads(result)
-            parsing_id = contents[i].id
-            parsing_data = contents[i].parsing_data  
-            content_id = contents[i].content_id
-            score = result.get("score", -2)
-            score_reason = result.get("reason", "")
+            try:
+                # 只处理大括号外面的内容,保留JSON内部格式
+                result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
+                # 尝试修复常见的JSON格式问题
+                result = result.replace("'", "\"")  # 将单引号替换为双引号
+                result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result)  # 确保属性名有双引号
+                
+                # 解析JSON
+                parsed_result = json.loads(result)
+                parsing_id = contents[i].id
+                parsing_data = contents[i].parsing_data  
+                content_id = contents[i].content_id
+                score = parsed_result.get("score", -2)
+                score_reason = parsed_result.get("reason", "")
+            except Exception as json_error:
+                logger.error(f"评估JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
+                parsing_id = contents[i].id
+                parsing_data = contents[i].parsing_data
+                content_id = contents[i].content_id
+                score = -1
+                score_reason = f"JSON解析错误: {str(json_error)}"
             
             evaluation_results.append((parsing_id, score, score_reason, parsing_data, content_id))
         
@@ -277,17 +288,27 @@ def batch_call_llm_for_extraction(evaluation_results: list, query_word: str) ->
         # 处理返回结果
         extraction_results = []
         for i, result in enumerate(results):
-            # 只处理大括号外面的内容,保留JSON内部格式
-            result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
-            result = json.loads(result)
-            extracted_data = result.get("extracted_content", "未提取到内容")
-            clean_reason = result.get("analysis_reason", "未返回原因")
+            try:
+                # 只处理大括号外面的内容,保留JSON内部格式
+                result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
+                # 尝试修复常见的JSON格式问题
+                result = result.replace("'", "\"")  # 将单引号替换为双引号
+                result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result)  # 确保属性名有双引号
+                
+                # 解析JSON
+                parsed_result = json.loads(result)
+                extracted_data = parsed_result.get("extracted_content", "未提取到内容")
+                clean_reason = parsed_result.get("analysis_reason", "未返回原因")
+            except Exception as json_error:
+                logger.error(f"JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
+                extracted_data = "未提取到内容"
+                clean_reason = f"JSON解析错误: {str(json_error)}"
             
             extraction_results.append((extracted_data, clean_reason))
         
         return extraction_results
         
     except Exception as e:
-        logger.error(f"批量抽取过程异常: {str(e)}")
+        logger.error(f"批量抽取过程异常: {str(e)} results:{results}")
         # 返回空结果,确保返回类型为元组列表
         return [("未提取到内容", "抽取过程异常") for _ in range(len(evaluation_results))]