|
@@ -123,8 +123,7 @@ def batch_evaluate_content(contents: list, db: Session, request_id: str, query_w
|
|
|
try:
|
|
try:
|
|
|
# 批量调用大模型进行评估
|
|
# 批量调用大模型进行评估
|
|
|
evaluation_results_raw = batch_call_llm_for_evaluation(contents, query_word)
|
|
evaluation_results_raw = batch_call_llm_for_evaluation(contents, query_word)
|
|
|
-
|
|
|
|
|
- print(evaluation_results_raw)
|
|
|
|
|
|
|
+
|
|
|
# 处理评估结果
|
|
# 处理评估结果
|
|
|
evaluation_results = []
|
|
evaluation_results = []
|
|
|
|
|
|
|
@@ -168,7 +167,7 @@ def batch_extract_and_save_content(evaluation_results: list, db: Session, reques
|
|
|
success_ids = []
|
|
success_ids = []
|
|
|
failed_ids = []
|
|
failed_ids = []
|
|
|
|
|
|
|
|
- for i, extraction_data in enumerate(extraction_data_list):
|
|
|
|
|
|
|
+ for i, (extracted_data, clean_reason) in enumerate(extraction_data_list):
|
|
|
try:
|
|
try:
|
|
|
evaluation_result = evaluation_results[i]
|
|
evaluation_result = evaluation_results[i]
|
|
|
parsing_id = evaluation_result.get("parsing_id")
|
|
parsing_id = evaluation_result.get("parsing_id")
|
|
@@ -176,7 +175,8 @@ def batch_extract_and_save_content(evaluation_results: list, db: Session, reques
|
|
|
if "extraction_content" in evaluation_result and parsing_id:
|
|
if "extraction_content" in evaluation_result and parsing_id:
|
|
|
# 更新已有对象的data字段和状态
|
|
# 更新已有对象的data字段和状态
|
|
|
extraction_content = evaluation_result["extraction_content"]
|
|
extraction_content = evaluation_result["extraction_content"]
|
|
|
- extraction_content.data = extraction_data
|
|
|
|
|
|
|
+ extraction_content.data = extracted_data
|
|
|
|
|
+ extraction_content.clean_reason = clean_reason
|
|
|
extraction_content.status = 2 # 处理完成
|
|
extraction_content.status = 2 # 处理完成
|
|
|
success_ids.append(parsing_id)
|
|
success_ids.append(parsing_id)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -216,10 +216,8 @@ evaluation_prompt_path = os.path.join(project_root, 'prompt', 'evaluation.md')
|
|
|
extraction_prompt_path = os.path.join(project_root, 'prompt', 'extraction.md')
|
|
extraction_prompt_path = os.path.join(project_root, 'prompt', 'extraction.md')
|
|
|
|
|
|
|
|
# 打印路径信息,用于调试
|
|
# 打印路径信息,用于调试
|
|
|
-logger.info(f"评估提示词路径: {evaluation_prompt_path}")
|
|
|
|
|
EVALUATION_PROMPT = read_prompt_file(evaluation_prompt_path)
|
|
EVALUATION_PROMPT = read_prompt_file(evaluation_prompt_path)
|
|
|
|
|
|
|
|
-logger.info(f"抽取提示词路径: {extraction_prompt_path}")
|
|
|
|
|
EXTRACTION_PROMPT = read_prompt_file(extraction_prompt_path)
|
|
EXTRACTION_PROMPT = read_prompt_file(extraction_prompt_path)
|
|
|
|
|
|
|
|
def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
|
|
def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
|
|
@@ -245,7 +243,7 @@ def batch_call_llm_for_evaluation(contents: list, query_word: str) -> list:
|
|
|
parsing_id = contents[i].id
|
|
parsing_id = contents[i].id
|
|
|
parsing_data = contents[i].parsing_data
|
|
parsing_data = contents[i].parsing_data
|
|
|
score = result.get("score", -2)
|
|
score = result.get("score", -2)
|
|
|
- score_reason = result.get("score_reason", "")
|
|
|
|
|
|
|
+ score_reason = result.get("reason", "")
|
|
|
|
|
|
|
|
evaluation_results.append((parsing_id, score, score_reason, parsing_data))
|
|
evaluation_results.append((parsing_id, score, score_reason, parsing_data))
|
|
|
|
|
|
|
@@ -265,8 +263,7 @@ def batch_call_llm_for_extraction(evaluation_results: list, query_word: str) ->
|
|
|
"query_word": query_word,
|
|
"query_word": query_word,
|
|
|
"content": parsing_data
|
|
"content": parsing_data
|
|
|
})
|
|
})
|
|
|
-
|
|
|
|
|
- logger.info(f"批量抽取内容: {extraction_contents}")
|
|
|
|
|
|
|
+
|
|
|
try:
|
|
try:
|
|
|
# 批量调用 Gemini 进行抽取
|
|
# 批量调用 Gemini 进行抽取
|
|
|
results = gemini_processor.batch_process(extraction_contents, EXTRACTION_PROMPT)
|
|
results = gemini_processor.batch_process(extraction_contents, EXTRACTION_PROMPT)
|
|
@@ -275,7 +272,11 @@ def batch_call_llm_for_extraction(evaluation_results: list, query_word: str) ->
|
|
|
extraction_results = []
|
|
extraction_results = []
|
|
|
for i, result in enumerate(results):
|
|
for i, result in enumerate(results):
|
|
|
result = re.sub(r'^\s*```json|\s*```\s*$', '', result, flags=re.MULTILINE).strip()
|
|
result = re.sub(r'^\s*```json|\s*```\s*$', '', result, flags=re.MULTILINE).strip()
|
|
|
- extraction_results.append(result)
|
|
|
|
|
|
|
+ result = json.loads(result)
|
|
|
|
|
+ extracted_data = result.get("extracted_content", "未提取到内容")
|
|
|
|
|
+ clean_reason = result.get("analysis_reason", "未返回原因")
|
|
|
|
|
+
|
|
|
|
|
+ extraction_results.append((extracted_data, clean_reason))
|
|
|
|
|
|
|
|
return extraction_results
|
|
return extraction_results
|
|
|
|
|
|