@@ -17,6 +17,9 @@ import errno
import multiprocessing
from typing import Any, Dict, List, Optional, TypedDict, Annotated
from contextlib import asynccontextmanager
+
+# Set the environment variable to suppress gRPC fork warnings
+os.environ.setdefault('GRPC_POLL_STRATEGY', 'poll')
from utils.mysql_db import MysqlHelper
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import JSONResponse
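One ordering note on the hunk above: gRPC reads `GRPC_POLL_STRATEGY` when its core first initializes, so the `os.environ.setdefault` call only has an effect if it runs before the first direct or transitive `import grpc`. A minimal sketch of the intended ordering (the explicit `grpc` import is illustrative; in this service it would arrive transitively through later imports):

    import os

    # Must happen before anything initializes the gRPC core.
    os.environ.setdefault('GRPC_POLL_STRATEGY', 'poll')

    import grpc  # illustrative; any import that pulls in grpc behaves the same way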
@@ -358,6 +361,101 @@ RUNNING_LOCK = asyncio.Lock()
# LangGraph workflow definition
# =========================

+def process_single_item(args):
+    """Process a single data item; used by the multiprocessing pool (module-level so it can be pickled)."""
+    idx, item, request_id = args
+    try:
+        crawl_data = item.get('crawl_data') or {}
+        content_id = item.get('content_id') or ''
+        task_id = item.get('task_id') or ''
+
+        # First check the database to see whether this item has already been processed
+        check_sql = "SELECT id,status,indentify_data FROM knowledge_parsing_content WHERE request_id = %s AND content_id = %s"
+        check_result = MysqlHelper.get_values(check_sql, (request_id, content_id))
+        result_status = 0
+        result_id = 0
+        result_indentify_data = {}
+        if check_result:
+            row_id, status, indentify_data = check_result[0]
+            logger.info(f"Found an item pending structuring, id: {row_id}, status: {status}, indentify_data: {str(indentify_data)[:100]}")
+            result_status = status
+            result_id = row_id
+            result_indentify_data = indentify_data
+            if status == 5:
+                return {
+                    "index": idx,
+                    "dbInserted": True,
+                    "identifyError": None,
+                    "status": 2,
+                    "success": True
+                }
+
+        # status 0: not yet identified; status 3: identification failed; both need (re-)identification
+        if result_status == 0 or result_status == 3:
+            # Step 1: identification
+            identify_result = identify_tool.run(
+                crawl_data if isinstance(crawl_data, dict) else {}
+            )
+
+            # Step 2: persist the identification result
+            affected = UpdateDataTool.store_indentify_result(
+                request_id,
+                {
+                    "content_id": content_id,
+                    "task_id": task_id
+                },
+                identify_result
+            )
+        else:
+            # result_indentify_data is a JSON string; parse it into an object
+            identify_result = json.loads(result_indentify_data) if isinstance(result_indentify_data, str) else result_indentify_data
+            affected = result_id
+
+        # Use StructureTool to structure the content
+        structure_tool = StructureTool()
+        structure_result = structure_tool.process_content_structure(identify_result)
+
+        # Store the structured parsing result
+        parsing_affected = UpdateDataTool.store_parsing_result(
+            request_id,
+            {
+                "id": affected,
+                "content_id": content_id,
+                "task_id": task_id
+            },
+            structure_result
+        )
+        logger.info(f"Debug info: affected={affected}, content_id={content_id}, result_status={result_status}")
+        ok = affected is not None and affected > 0 and parsing_affected is not None and parsing_affected > 0
+        if ok:
+            success = True
+        else:
+            success = False
+            logger.error(f"Error processing item {idx}: {identify_result.get('error') or structure_result.get('error')}")
+
+        # Record processing details
+        detail = {
+            "index": idx,
+            "dbInserted": ok,
+            "identifyError": identify_result.get('error') or structure_result.get('error'),
+            "status": 2 if ok else 3,
+            "success": success
+        }
+
+        logger.info(f"Progress: item {idx} - {'success' if ok else 'failed'}")
+        return detail
+
+    except Exception as e:
+        logger.error(f"Error processing item {idx}: {e}")
+        return {
+            "index": idx,
+            "dbInserted": False,
+            "identifyError": str(e),
+            "status": 3,
+            "success": False
+        }
+
+
def create_langgraph_workflow():
    """Create the LangGraph workflow"""
    if not HAS_LANGGRAPH:
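`process_single_item` is hoisted to module level because, under the 'spawn' start method configured below, `multiprocessing.Pool` pickles task functions by reference: only a function importable by its qualified module-level name can be sent to workers, while a closure defined inside `create_langgraph_workflow()` cannot. A minimal self-contained sketch of that constraint (toy names, not part of the patch):

    import multiprocessing

    def double(x):
        # Module-level, so spawn workers can re-import it by name.
        return x * 2

    if __name__ == '__main__':
        ctx = multiprocessing.get_context('spawn')
        with ctx.Pool(processes=3) as pool:
            print(pool.map(double, [1, 2, 3]))  # -> [2, 4, 6]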
@@ -388,99 +486,7 @@ def create_langgraph_workflow():
            state["status"] = "error"
            return state

-    def process_single_item(args):
-        """处理单个数据项的函数,用于多进程"""
-        idx, item, request_id = args
-        try:
-            crawl_data = item.get('crawl_data') or {}
-            content_id = item.get('content_id') or ''
-            task_id = item.get('task_id') or ''
-
-            # 先在库中查询是否已经处理过
-            check_sql = "SELECT id,status,indentify_data FROM knowledge_parsing_content WHERE request_id = %s AND content_id = %s"
-            check_result = MysqlHelper.get_values(check_sql, (request_id, content_id))
-            result_status = 0
-            result_id = 0
-            result_indentify_data = {}
-            if check_result:
-                id, status, indentify_data = check_result[0]
-                logger.info(f"查询到待结构化处理的条目,id: {id}, status: {status}, indentify_data: {str(indentify_data)[:100]}")
-                result_status = status
-                result_id = id
-                result_indentify_data = indentify_data
-                if status == 5:
-                    return {
-                        "index": idx,
-                        "dbInserted": True,
-                        "identifyError": None,
-                        "status": 2,
-                        "success": True
-                    }
-
-            # 0 未识别 3识别失败,需要重新进行识别
-            if result_status == 0 or result_status == 3:
-                # Step 1: 识别
-                identify_result = identify_tool.run(
-                    crawl_data if isinstance(crawl_data, dict) else {}
-                )
-
-                # Step 2: 结构化并入库
-                affected = UpdateDataTool.store_indentify_result(
-                    request_id,
-                    {
-                        "content_id": content_id,
-                        "task_id": task_id
-                    },
-                    identify_result
-                )
-            else:
-                # result_indentify_data是JSON字符串,需要解析为对象
-                identify_result = json.loads(result_indentify_data) if isinstance(result_indentify_data, str) else result_indentify_data
-                affected = result_id
-
-            # 使用StructureTool进行内容结构化处理
-            structure_tool = StructureTool()
-            structure_result = structure_tool.process_content_structure(identify_result)
-
-            # 存储结构化解析结果
-            parsing_affected = UpdateDataTool.store_parsing_result(
-                request_id,
-                {
-                    "id": affected,
-                    "content_id": content_id,
-                    "task_id": task_id
-                },
-                structure_result
-            )
-            logger.info(f"调试信息: affected={affected}, content_id={content_id}, result_status={result_status}")
-            ok = affected is not None and affected > 0 and parsing_affected is not None and parsing_affected > 0
-            if ok:
-                success = True
-            else:
-                success = True
-                logger.error(f"处理第 {idx} 项时出错: {identify_result.get('error') or structure_result.get('error')}")
-
-            # 记录处理详情
-            detail = {
-                "index": idx,
-                "dbInserted": ok,
-                "identifyError": identify_result.get('error') or structure_result.get('error'),
-                "status": 2 if ok else 3,
-                "success": success
-            }
-
-            logger.info(f"处理进度: {idx} - {'成功' if ok else '失败'}")
-            return detail
-
-        except Exception as e:
-            logger.error(f"处理第 {idx} 项时出错: {e}")
-            return {
-                "index": idx,
-                "dbInserted": False,
-                "identifyError": str(e),
-                "status": 3,
-                "success": False
-            }
+

def process_items_batch(state: AgentState) -> AgentState:
    """Batch-process all data items, parallelized with multiprocessing"""
@@ -495,9 +501,22 @@

    # Process in parallel with 3 worker processes, with a multiprocessing guard
    if __name__ == '__main__' or multiprocessing.current_process().name == 'MainProcess':
+        # Set the multiprocessing start method to 'spawn' to avoid gRPC fork issues
+        original_start_method = multiprocessing.get_start_method()
+        try:
+            multiprocessing.set_start_method('spawn', force=True)
+        except RuntimeError:
+            pass  # Ignore the error if the start method was already set
+
        with multiprocessing.Pool(processes=3) as pool:
            logger.info(f"Starting multiprocess processing: count={len(process_args)}")
            results = pool.map(process_single_item, process_args)
+
+        # Restore the original start method
+        try:
+            multiprocessing.set_start_method(original_start_method, force=True)
+        except RuntimeError:
+            pass
    else:
        # Not in the main process; fall back to serial processing
        logger.warning("Not in the main process; falling back to serial processing")
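A possible follow-up (a sketch under the same assumptions, not part of the patch): `multiprocessing.get_context('spawn')` yields a context-local pool, which avoids mutating and then restoring the process-wide start method around each batch and is safer if other threads create pools concurrently:

    import multiprocessing

    def run_batch(func, process_args):
        # Context-local 'spawn' pool; the global start method is left untouched.
        ctx = multiprocessing.get_context('spawn')
        with ctx.Pool(processes=3) as pool:
            return pool.map(func, process_args)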
|