|
@@ -78,8 +78,6 @@ def execute_continuous_evaluation_extraction(request_id: str, db: Session, query
|
|
|
try:
|
|
|
# 批量评估内容并创建KnowledgeExtractionContent对象
|
|
|
evaluation_results = batch_evaluate_content(contents, db, request_id, query_word)
|
|
|
-
|
|
|
- print(f"""evaluation_results: {evaluation_results}""")
|
|
|
|
|
|
# 对评分大于阈值的内容进行抽取
|
|
|
high_score_results = [result for result in evaluation_results if result["score"] >= SCORE_THRESHOLD]
|
|
@@ -237,14 +235,27 @@ def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
|
|
|
# 处理返回结果
|
|
|
evaluation_results = []
|
|
|
for i, result in enumerate(results):
|
|
|
- # 只处理大括号外面的内容,保留JSON内部格式
|
|
|
- result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
|
|
|
- result = json.loads(result)
|
|
|
- parsing_id = contents[i].id
|
|
|
- parsing_data = contents[i].parsing_data
|
|
|
- content_id = contents[i].content_id
|
|
|
- score = result.get("score", -2)
|
|
|
- score_reason = result.get("reason", "")
|
|
|
+ try:
|
|
|
+ # 只处理大括号外面的内容,保留JSON内部格式
|
|
|
+ result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
|
|
|
+ # 尝试修复常见的JSON格式问题
|
|
|
+ result = result.replace("'", "\"") # 将单引号替换为双引号
|
|
|
+ result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result) # 确保属性名有双引号
|
|
|
+
|
|
|
+ # 解析JSON
|
|
|
+ parsed_result = json.loads(result)
|
|
|
+ parsing_id = contents[i].id
|
|
|
+ parsing_data = contents[i].parsing_data
|
|
|
+ content_id = contents[i].content_id
|
|
|
+ score = parsed_result.get("score", -2)
|
|
|
+ score_reason = parsed_result.get("reason", "")
|
|
|
+ except Exception as json_error:
|
|
|
+ logger.error(f"评估JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
|
|
|
+ parsing_id = contents[i].id
|
|
|
+ parsing_data = contents[i].parsing_data
|
|
|
+ content_id = contents[i].content_id
|
|
|
+ score = -1
|
|
|
+ score_reason = f"JSON解析错误: {str(json_error)}"
|
|
|
|
|
|
evaluation_results.append((parsing_id, score, score_reason, parsing_data, content_id))
|
|
|
|
|
@@ -277,17 +288,27 @@ def batch_call_llm_for_extraction(evaluation_results: list, query_word: str) ->
|
|
|
# 处理返回结果
|
|
|
extraction_results = []
|
|
|
for i, result in enumerate(results):
|
|
|
- # 只处理大括号外面的内容,保留JSON内部格式
|
|
|
- result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
|
|
|
- result = json.loads(result)
|
|
|
- extracted_data = result.get("extracted_content", "未提取到内容")
|
|
|
- clean_reason = result.get("analysis_reason", "未返回原因")
|
|
|
+ try:
|
|
|
+ # 只处理大括号外面的内容,保留JSON内部格式
|
|
|
+ result = re.sub(r'(^\s*```json)|(\s*```\s*$)', '', result, flags=re.MULTILINE).strip()
|
|
|
+ # 尝试修复常见的JSON格式问题
|
|
|
+ result = result.replace("'", "\"") # 将单引号替换为双引号
|
|
|
+ result = re.sub(r'([{,])\s*(\w+)\s*:', r'\1"\2":', result) # 确保属性名有双引号
|
|
|
+
|
|
|
+ # 解析JSON
|
|
|
+ parsed_result = json.loads(result)
|
|
|
+ extracted_data = parsed_result.get("extracted_content", "未提取到内容")
|
|
|
+ clean_reason = parsed_result.get("analysis_reason", "未返回原因")
|
|
|
+ except Exception as json_error:
|
|
|
+ logger.error(f"JSON解析错误: {str(json_error)},原始内容: {result[:100]}...")
|
|
|
+ extracted_data = "未提取到内容"
|
|
|
+ clean_reason = f"JSON解析错误: {str(json_error)}"
|
|
|
|
|
|
extraction_results.append((extracted_data, clean_reason))
|
|
|
|
|
|
return extraction_results
|
|
|
|
|
|
except Exception as e:
|
|
|
- logger.error(f"批量抽取过程异常: {str(e)}")
|
|
|
+ logger.error(f"批量抽取过程异常: {str(e)} results:{results}")
|
|
|
# 返回空结果,确保返回类型为元组列表
|
|
|
return [("未提取到内容", "抽取过程异常") for _ in range(len(evaluation_results))]
|