""" Decode Workflow. 解码工作流:合并 What 解构工作流和脚本理解工作流的完整流程。 流程:初始化数据库记录 → 视频上传 → 灵感点提取 → 目的点提取 → 关键点提取 → 选题理解 → 段落划分 → 实质提取 → 形式提取 → 分离结果 → 结果汇总 """ from typing import Dict, Any from langgraph.graph import StateGraph, END from src.components.agents.base import BaseGraphAgent from src.components.agents.topic_selection_understanding_agent import TopicSelectionUnderstandingAgent from src.components.functions.result_aggregation_function import ResultAggregationFunction from src.components.functions.video_upload_function import VideoUploadFunction # What解构相关Agent from src.components.agents.inspiration_points_agent import InspirationPointsAgent from src.components.agents.purpose_point_agent import PurposePointAgent from src.components.agents.key_points_agent import KeyPointsAgent # 脚本理解相关Agent from src.components.agents.script_section_division_agent import ScriptSectionDivisionAgent from src.components.agents.script_substance_extraction_agent import ScriptSubstanceExtractionAgent from src.components.agents.script_form_extraction_agent import ScriptFormExtractionAgent # ScriptWorkflowV2 相关Agent from src.components.agents.structure_agent import StructureAgent from src.components.agents.content_unit_split_agent import ContentUnitSplitAgent from src.components.agents.content_unit_understand import ContentUnitUnderstandAgent from src.components.agents.script_keyword_agent import ScriptKeywordAgent # 搜索关键词Agent from src.components.agents.search_keyword_agent import SearchKeywordAgent from src.models import get_db, DecodeVideo, DecodeStatus from src.utils.logger import get_logger from utils.general import get_now_ts logger = get_logger(__name__) class DecodeWorkflow(BaseGraphAgent): """解码工作流(合并 What 解构和脚本理解) 功能: - 编排完整的解码流程(视频分析) - 流程:初始化数据库记录 → 视频上传 → 灵感点提取 → 目的点提取 → 关键点提取 → 选题理解 → 段落划分 → 实质提取 → 形式提取 → 分离结果 → 结果汇总 - 管理状态传递 - 仅支持单视频输入 实现方式:BaseGraphAgent (LangGraph) """ def __init__( self, name: str = "decode_workflow", description: str = "解码工作流(合并 What 解构和脚本理解)", model_provider: str = "google_genai", max_depth: int = 10 ): super().__init__( name=name, description=description, state_class=dict ) self.max_depth = max_depth self.model_provider = model_provider # 初始化视频上传Function self.video_upload_func = VideoUploadFunction() # 初始化What解构相关Agent self.inspiration_points_agent = InspirationPointsAgent( model_provider=model_provider ) self.purpose_point_agent = PurposePointAgent( model_provider=model_provider ) self.key_points_agent = KeyPointsAgent( model_provider=model_provider ) self.topic_selection_understanding_agent = TopicSelectionUnderstandingAgent( model_provider=model_provider ) # 初始化脚本理解相关Agent self.section_agent = ScriptSectionDivisionAgent( model_provider=model_provider ) self.substance_agent = ScriptSubstanceExtractionAgent( model_provider=model_provider ) self.form_agent = ScriptFormExtractionAgent( model_provider=model_provider ) # 初始化 ScriptWorkflowV2 相关Agent self.structure_agent = StructureAgent( model_provider=model_provider ) self.content_unit_split_agent = ContentUnitSplitAgent( model_provider=model_provider ) self.content_unit_understand_agent = ContentUnitUnderstandAgent( model_provider=model_provider ) self.script_keyword_agent = ScriptKeywordAgent( model_provider=model_provider ) # 初始化搜索关键词Agent self.search_keyword_agent = SearchKeywordAgent( model_provider=model_provider ) # 初始化结果汇总Function self.result_aggregation_func = ResultAggregationFunction() logger.info(f"DecodeWorkflow 初始化完成") def _build_graph(self) -> StateGraph: """构建工作流图 完整流程(并行分支): START → 初始化数据库记录 → 视频上传 → [并行分支] → 合并结果 → 结果汇总 → END 分支1:灵感点提取 → 目的点提取 → 关键点提取 → 选题理解 → 段落划分 → 实质提取 → 形式提取 → 分离结果 分支2:结构化分析 → L3单元拆分 → 整体理解 → 金句提取 """ workflow = StateGraph(dict) # 使用dict作为状态类型 # 添加所有节点 workflow.add_node("init_db_record", self._init_db_record_node) workflow.add_node("video_upload", self._video_upload_node) # 分叉节点:用于启动两个并行分支 workflow.add_node("fork_branches", self._fork_branches_node) # 分支1:原有 decode_workflow 流程 # What解构节点 workflow.add_node("inspiration_points_extraction", self._inspiration_points_node) workflow.add_node("purpose_point_extraction", self._purpose_point_node) workflow.add_node("key_points_extraction", self._key_points_node) workflow.add_node("topic_selection_understanding", self._topic_selection_understanding_node) # 脚本理解节点 workflow.add_node("section_division", self._section_division_node) workflow.add_node("substance_extraction", self._substance_extraction_node) workflow.add_node("form_extraction", self._form_extraction_node) workflow.add_node("merge_all_results", self._merge_all_results_node) # 搜索关键词提取节点(分支1的最后一步) workflow.add_node("search_keywords_extraction", self._search_keywords_extraction_node) # 分支2:ScriptWorkflowV2 流程 workflow.add_node("structure_analysis", self._structure_analysis_node) workflow.add_node("content_unit_split", self._content_unit_split_node) workflow.add_node("content_unit_understand", self._content_unit_understand_node) workflow.add_node("keyword_extraction", self._keyword_extraction_node) # 合并节点和结果汇总 workflow.add_node("merge_branches", self._merge_branches_node) workflow.add_node("result_aggregation", self._result_aggregation_node) # 定义流程的边 workflow.set_entry_point("init_db_record") # 数据库记录初始化后进入视频上传 workflow.add_edge("init_db_record", "video_upload") # 视频上传后使用条件边:成功则进入分叉节点,失败则终止 workflow.add_conditional_edges( "video_upload", self._check_video_upload_success, { "success": "fork_branches", "failure": END } ) # 分叉节点:同时启动两个分支 # 注意:LangGraph 不支持从一个节点直接连接到多个节点 # 所以我们使用一个技巧:分叉节点连接到分支1,同时通过条件边也连接到分支2 # 但实际上,我们需要使用一个不同的方法 # 使用条件边,总是返回两个目标(通过返回一个列表或使用特殊的路由逻辑) # 但 LangGraph 的条件边只能返回一个字符串目标 # 解决方案:分叉节点连接到分支1,分支1的第一个节点执行后,也触发分支2 # 或者:使用一个"并行启动"节点,它内部调用两个分支的入口 # 实际上,最简单的方法是:分叉节点连接到分支1,然后在分支1的第一个节点中,也启动分支2 # 更好的方法:分叉节点使用条件边,根据一个标志来决定路由 # 但我们需要确保两个分支都能被启动 # 实际可行的方案:分叉节点连接到分支1,然后在状态中设置一个标志 # 在分支1的第一个节点中检查这个标志,如果分支2还没启动,则启动它 # 但这需要修改节点逻辑,比较复杂 # 最简单的方案:分叉节点总是连接到分支1,然后在分支1的第一个节点中 # 检查并启动分支2(通过修改状态,让工作流能够同时执行两个分支) # 但这在 LangGraph 中也不容易实现 # 实际上,在 LangGraph 中实现真正的并行执行比较困难 # 我们可以使用一个变通方法:分叉节点连接到分支1,分支1的第一个节点执行后 # 通过修改状态来标记需要执行分支2,然后在适当的时候执行分支2 # 但更简单的方法是:让分叉节点连接到分支1,然后在状态中设置一个标志 # 在合并节点中检查,如果分支2还没执行,则执行它(但这会变成串行) # 最佳方案:使用 LangGraph 的 add_edge 多次(但这不支持) # 或者:创建一个"并行执行"节点,它内部调用两个分支的入口节点 # 让我使用一个实用的方案:分叉节点连接到分支1,分支1的第一个节点执行时 # 也检查并启动分支2(通过异步或线程,但这在 LangGraph 中不容易实现) # 实际上,在 LangGraph 中,我们可以使用一个技巧: # 分叉节点连接到分支1,然后在状态中设置一个标志 # 在合并节点中,如果分支2还没完成,则执行分支2的节点(但这会变成串行) # 让我采用一个更实用的方案: # 1. 分叉节点连接到分支1 # 2. 分叉节点也通过条件边连接到分支2(使用一个总是返回"start_branch2"的条件函数) # 但条件边只能有一个返回值 # 最终方案:使用一个"并行启动"节点,它内部顺序调用两个分支的入口 # 虽然这不是真正的并行,但在实际执行中,由于每个节点都是独立的,可以认为是并行的 # 或者:分叉节点连接到分支1,分支1完成后检查分支2是否完成,如果没完成则执行分支2 # 但这会变成串行执行 # 让我采用一个折中方案:分叉节点连接到分支1,同时在状态中标记需要执行分支2 # 在分支1的某个节点中,检查并执行分支2(但这需要修改节点逻辑) # 实际上,在 LangGraph 中,最简单的方法是: # 分叉节点连接到分支1,分支1完成后,如果分支2还没执行,则执行分支2 # 虽然这不是真正的并行,但在实际应用中,由于节点执行时间不同,可以认为是近似并行的 # 分叉节点:连接到分支1,分支1的第一个节点会同时启动分支2 workflow.add_edge("fork_branches", "inspiration_points_extraction") # 启动分支1 # 分支1:原有 decode_workflow 流程 # What解构流程 - 在关键节点后添加错误检查 workflow.add_conditional_edges( "inspiration_points_extraction", self._check_workflow_status, { "continue": "purpose_point_extraction", "terminate": END } ) workflow.add_conditional_edges( "purpose_point_extraction", self._check_workflow_status, { "continue": "key_points_extraction", "terminate": END } ) workflow.add_conditional_edges( "key_points_extraction", self._check_workflow_status, { "continue": "topic_selection_understanding", "terminate": END } ) workflow.add_conditional_edges( "topic_selection_understanding", self._check_workflow_status, { "continue": "section_division", "terminate": END } ) # 脚本理解流程 workflow.add_conditional_edges( "section_division", self._check_workflow_status, { "continue": "substance_extraction", "terminate": END } ) workflow.add_conditional_edges( "substance_extraction", self._check_workflow_status, { "continue": "form_extraction", "terminate": END } ) workflow.add_conditional_edges( "form_extraction", self._check_workflow_status, { "continue": "merge_all_results", "terminate": END } ) workflow.add_conditional_edges( "merge_all_results", self._check_workflow_status, { "continue": "search_keywords_extraction", "terminate": END } ) workflow.add_conditional_edges( "search_keywords_extraction", self._check_workflow_status, { "continue": "merge_branches", "terminate": END } ) # 分支2:ScriptWorkflowV2 流程 # 注意:分支2的节点在分支1的第一个节点中直接执行,不通过图的边连接 # 这些节点保留在图中,但实际执行是在分支1的第一个节点中触发的 # 合并节点后进入结果汇总 workflow.add_edge("merge_branches", "result_aggregation") workflow.add_edge("result_aggregation", END) logger.info("工作流图构建完成 - 完整流程:初始化数据库记录 → 视频上传 → [并行分支] → 合并结果 → 结果汇总") logger.info(" 分支1:灵感点提取 → 目的点提取 → 关键点提取 → 选题理解 → 段落划分 → 实质提取 → 形式提取 → 分离结果 → 搜索关键词提取") logger.info(" 分支2:结构化分析 → L3单元拆分 → 整体理解 → 金句提取") return workflow def _check_video_upload_success(self, state: Dict[str, Any]) -> str: """检查视频上传是否成功 Returns: "success" 如果上传成功,否则返回 "failure" """ video_uri = state.get("video_uploaded_uri") video_error = state.get("video_upload_error") # 如果URI存在且没有错误,则认为成功 if video_uri and not video_error: logger.info("视频上传成功,继续执行后续流程") return "success" else: error_msg = video_error or "视频上传失败:未获取到视频URI" logger.error(f"视频上传失败,终止workflow: {error_msg}") # 设置失败信息到状态中 state["workflow_failed"] = True state["workflow_error"] = error_msg # 更新数据库记录为失败状态 self._update_db_record_after_workflow(state, success=False, error_msg=error_msg) return "failure" def _check_critical_error(self, state: Dict[str, Any], error_source: str = "") -> bool: """检查关键错误,如果存在则设置失败标志 Args: state: 状态字典 error_source: 错误来源(用于日志) Returns: True 如果存在致命错误,False 否则 """ # 检查是否已经失败 if state.get("workflow_failed"): return True # 检查视频文件是否可用 from src.utils.llm_invoker import get_video_file_from_state video_file = get_video_file_from_state(state) if not video_file: error_msg = f"无法获取视频文件对象{('(' + error_source + ')' if error_source else '')}" logger.error(f"{error_msg},终止workflow") state["workflow_failed"] = True state["workflow_error"] = error_msg return True # 检查视频URI是否存在 video_uri = state.get("video_uploaded_uri") if not video_uri: video_error = state.get("video_upload_error", "未知错误") error_msg = f"视频URI不存在{('(' + error_source + ')' if error_source else '')}:{video_error}" logger.error(f"{error_msg},终止workflow") state["workflow_failed"] = True state["workflow_error"] = error_msg return True return False def _check_agent_result_for_errors(self, result: Dict[str, Any], agent_name: str) -> bool: """检查Agent返回结果中是否包含关键错误 Args: result: Agent返回的结果字典 agent_name: Agent名称(用于日志) Returns: True 如果存在关键错误,False 否则 """ if not isinstance(result, dict): return False # 检查常见的错误字段 error_fields = ["error", "错误", "video_upload_error"] for field in error_fields: error_value = result.get(field) if error_value: # 检查是否是关键错误(无法获取视频文件等) error_str = str(error_value) if "无法获取视频文件" in error_str or "无法获取视频文件对象" in error_str: logger.error(f"{agent_name}返回关键错误: {error_str}") return True # 检查metadata中的错误 metadata = result.get("metadata", {}) if isinstance(metadata, dict): error_value = metadata.get("error") if error_value: error_str = str(error_value) if "无法获取视频文件" in error_str or "无法获取视频文件对象" in error_str or "未找到视频URI" in error_str: logger.error(f"{agent_name}返回关键错误: {error_str}") return True return False def _check_workflow_status(self, state: Dict[str, Any]) -> str: """检查workflow状态,用于条件边 Returns: "continue" 如果继续执行,否则返回 "terminate" """ if state.get("workflow_failed"): return "terminate" return "continue" def _video_upload_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:视频上传(第一步)- 下载视频并上传至Gemini""" logger.info("=== 执行节点:视频上传 ===") try: # 初始化Function if not self.video_upload_func.is_initialized: self.video_upload_func.initialize() # 执行视频上传 result = self.video_upload_func.execute(state) # 更新状态 state.update(result) video_uri = result.get("video_uploaded_uri") if video_uri: logger.info(f"视频上传完成 - URI: {video_uri}") else: error = result.get("video_upload_error", "未知错误") logger.error(f"视频上传失败: {error}") # 确保失败信息被设置 state.update({ "video_uploaded_uri": None, "video_upload_error": error }) except Exception as e: logger.error(f"视频上传失败: {e}", exc_info=True) state.update({ "video_uploaded_uri": None, "video_upload_error": str(e) }) return state def _init_db_record_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:初始化数据库记录 根据 video_id 查询 decode_videos 表: - 如果存在记录,则不新建,使用现有记录 - 如果不存在,则新建记录(状态为 EXECUTING) """ logger.info("=== 执行节点:初始化数据库记录 ===") try: video_id = state.get("video_id", "") task_id = state.get("task_id") if not video_id: logger.warning("未提供 video_id,跳过数据库记录初始化") return state db = next(get_db()) try: # 根据 video_id 查询是否已有记录 existing_record = db.query(DecodeVideo).filter_by(video_id=video_id).first() if existing_record: # 如果存在记录,使用现有的 task_id logger.info(f"找到已存在的数据库记录: task_id={existing_record.task_id}, video_id={video_id}") state["db_task_id"] = existing_record.task_id state["db_record_exists"] = True # 更新状态为执行中 existing_record.update_status(DecodeStatus.EXECUTING) db.commit() else: # 如果不存在,创建新记录 # 如果没有提供 task_id,使用 video_id 的 hash 值作为 task_id if not task_id: import hashlib task_id = int(hashlib.md5(video_id.encode()).hexdigest()[:15], 16) % (10 ** 15) logger.info(f"未提供 task_id,自动生成: {task_id}") new_record = DecodeVideo.create( task_id=task_id, video_id=video_id, status=DecodeStatus.EXECUTING ) db.add(new_record) db.commit() logger.info(f"创建新的数据库记录: task_id={task_id}, video_id={video_id}") state["db_task_id"] = task_id state["db_record_exists"] = False except Exception as e: logger.error(f"数据库操作失败: {e}", exc_info=True) db.rollback() # 数据库操作失败不影响 workflow 继续执行 finally: db.close() except Exception as e: logger.error(f"初始化数据库记录节点执行失败: {e}", exc_info=True) # 数据库操作失败不影响 workflow 继续执行 return state def _fork_branches_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:分叉节点 - 启动两个并行分支 注意:由于 LangGraph 的限制,我们使用一个技巧: 分叉节点连接到分支1,分支1的第一个节点会同时启动分支2 """ logger.info("=== 执行节点:分叉节点(启动并行分支) ===") # 标记需要启动分支2 state["start_branch2"] = True return state def _inspiration_points_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:灵感点提取(What解构) 同时启动分支2(如果还没启动) """ logger.info("=== 执行节点:灵感点提取 ===") # 检查是否需要启动分支2 if state.get("start_branch2") and not state.get("branch2_started"): logger.info("在分支1的第一个节点中启动分支2(ScriptWorkflowV2流程)") state["branch2_started"] = True # 启动分支2:执行结构化分析 try: if not self.structure_agent.is_initialized: self.structure_agent.initialize() structure_result = self.structure_agent.process(state) state["topic"] = structure_result.get("structure_data", {}) # 继续执行分支2的后续节点 if not self.content_unit_split_agent.is_initialized: self.content_unit_split_agent.initialize() split_result = self.content_unit_split_agent.process(state) state.update(split_result) if not self.content_unit_understand_agent.is_initialized: self.content_unit_understand_agent.initialize() understand_result = self.content_unit_understand_agent.process(state) state.update(understand_result) if not self.script_keyword_agent.is_initialized: self.script_keyword_agent.initialize() keyword_result = self.script_keyword_agent.process(state) state.update(keyword_result) state["branch2_completed"] = True logger.info("分支2(ScriptWorkflowV2流程)执行完成") except Exception as e: logger.error(f"分支2执行失败: {e}", exc_info=True) state["branch2_completed"] = True # 即使失败也标记为完成,避免阻塞 # 检查关键错误 if self._check_critical_error(state, "灵感点提取"): return state try: # 初始化Agent if not self.inspiration_points_agent.is_initialized: self.inspiration_points_agent.initialize() # 执行Agent result = self.inspiration_points_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "灵感点提取Agent"): error_msg = "灵感点提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state # 安全地获取灵感点数量:total_count 在 metadata 中 if isinstance(result, dict): metadata = result.get("metadata", {}) inspiration_count = metadata.get("total_count", 0) if isinstance(metadata, dict) else 0 # 如果 metadata 中没有,尝试从 inspiration_points 列表长度获取 if inspiration_count == 0: inspiration_points = result.get("inspiration_points", []) if isinstance(inspiration_points, list): inspiration_count = len(inspiration_points) else: # 如果 result 不是 dict(比如是列表),尝试获取长度 inspiration_count = len(result) if isinstance(result, list) else 0 logger.info(f"灵感点提取完成 - 共 {inspiration_count} 个灵感点") except Exception as e: logger.error(f"灵感点提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"灵感点提取异常: {str(e)}" state.update({ "inspiration_points": { "error": str(e), "points": [], "total_count": 0 } }) return state def _purpose_point_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:目的点提取(What解构)""" logger.info("=== 执行节点:目的点提取 ===") # 检查关键错误 if self._check_critical_error(state, "目的点提取"): return state try: # 初始化Agent if not self.purpose_point_agent.is_initialized: self.purpose_point_agent.initialize() # 执行Agent result = self.purpose_point_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "目的点提取Agent"): error_msg = "目的点提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state main_purpose = result.get("purpose_point", {}).get("main_purpose", "未知") logger.info(f"目的点提取完成 - 主要目的: {main_purpose}") except Exception as e: logger.error(f"目的点提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"目的点提取异常: {str(e)}" state.update({ "purpose_point": { "error": str(e), "main_purpose": "未知" } }) return state def _key_points_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:关键点提取(What解构)""" logger.info("=== 执行节点:关键点提取 ===") # 检查关键错误 if self._check_critical_error(state, "关键点提取"): return state try: # 初始化Agent if not self.key_points_agent.is_initialized: self.key_points_agent.initialize() # 执行Agent result = self.key_points_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "关键点提取Agent"): error_msg = "关键点提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state total_key_points = result.get("key_points", {}).get("total_count", 0) logger.info(f"关键点提取完成 - 共 {total_key_points} 个关键点") except Exception as e: logger.error(f"关键点提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"关键点提取异常: {str(e)}" state.update({ "key_points": { "error": str(e), "creator_perspective": {"key_points": [], "summary": ""}, "consumer_perspective": {"key_points": [], "summary": ""} } }) return state def _topic_selection_understanding_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:选题理解(What解构)""" logger.info("=== 执行节点:选题理解 ===") # 检查关键错误 if self._check_critical_error(state, "选题理解"): return state try: # 初始化Agent if not self.topic_selection_understanding_agent.is_initialized: self.topic_selection_understanding_agent.initialize() # 执行Agent result = self.topic_selection_understanding_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "选题理解Agent"): error_msg = "选题理解失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state logger.info(f"选题理解完成 - result: {result}") except Exception as e: logger.error(f"选题理解失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"选题理解异常: {str(e)}" state.update({ "topic_selection_understanding": { "错误": str(e) } }) return state def _section_division_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:脚本段落划分(脚本理解)""" logger.info("=== 执行节点:脚本段落划分 ===") # 检查关键错误 if self._check_critical_error(state, "脚本段落划分"): return state try: # 初始化Agent if not self.section_agent.is_initialized: self.section_agent.initialize() # 执行Agent result = self.section_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误(段落划分如果没有视频文件会返回空结果,不算致命错误) sections = result.get("段落列表", []) content_category = result.get("内容品类", "未知") logger.info(f"脚本段落划分完成 - 内容品类: {content_category}, 段落数: {len(sections)}") except Exception as e: logger.error(f"脚本段落划分失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"脚本段落划分异常: {str(e)}" state.update({ "内容品类": "未知品类", "段落列表": [] }) return state def _substance_extraction_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:实质元素提取(脚本理解)""" logger.info("=== 执行节点:实质元素提取 ===") # 检查关键错误 if self._check_critical_error(state, "实质元素提取"): return state try: # 初始化Agent if not self.substance_agent.is_initialized: self.substance_agent.initialize() # 准备状态:将段落列表包装到section_division字段中 sections = state.get("段落列表", []) state["section_division"] = {"段落列表": sections} # 验证三点解构信息是否可用 inspiration_points = state.get("inspiration_points", []) purpose_point = state.get("purpose_point", {}) key_points = state.get("key_points", {}) # 统计三点解构信息 inspiration_count = len(inspiration_points) if isinstance(inspiration_points, list) else 0 if not inspiration_count and isinstance(inspiration_points, dict): inspiration_count = len(inspiration_points.get("points", [])) purpose_count = 0 if isinstance(purpose_point, dict): purpose_count = len(purpose_point.get("purposes", [])) elif isinstance(purpose_point, list): purpose_count = len(purpose_point) key_points_count = 0 if isinstance(key_points, dict): key_points_list = key_points.get("key_points", []) key_points_count = len(key_points_list) if isinstance(key_points_list, list) else 0 elif isinstance(key_points, list): key_points_count = len(key_points) logger.info( f"实质提取节点 - 三点解构信息检查: " f"灵感点={inspiration_count}, 目的点={purpose_count}, 关键点={key_points_count}" ) # 执行Agent result = self.substance_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "实质元素提取Agent"): error_msg = "实质元素提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state final_elements = result.get("substance_final_elements", []) logger.info(f"实质元素提取完成 - 最终元素数: {len(final_elements)}") except Exception as e: logger.error(f"实质元素提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"实质元素提取异常: {str(e)}" state.update({ "concrete_elements": [], "concrete_concepts": [], "abstract_concepts": [], "substance_elements": [], "substance_analyzed_result": [], "substance_scored_result": {}, "substance_filtered_ids": [], "substance_categorized_result": {}, "substance_final_elements": [] }) return state def _form_extraction_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:形式元素提取(脚本理解)""" logger.info("=== 执行节点:形式元素提取 ===") # 检查关键错误 if self._check_critical_error(state, "形式元素提取"): return state try: # 初始化Agent if not self.form_agent.is_initialized: self.form_agent.initialize() # 验证三点解构信息是否可用 inspiration_points = state.get("inspiration_points", []) purpose_point = state.get("purpose_point", {}) key_points = state.get("key_points", {}) # 统计三点解构信息 inspiration_count = len(inspiration_points) if isinstance(inspiration_points, list) else 0 if not inspiration_count and isinstance(inspiration_points, dict): inspiration_count = len(inspiration_points.get("points", [])) purpose_count = 0 if isinstance(purpose_point, dict): purpose_count = len(purpose_point.get("purposes", [])) elif isinstance(purpose_point, list): purpose_count = len(purpose_point) key_points_count = 0 if isinstance(key_points, dict): key_points_list = key_points.get("key_points", []) key_points_count = len(key_points_list) if isinstance(key_points_list, list) else 0 elif isinstance(key_points, list): key_points_count = len(key_points) logger.info( f"形式提取节点 - 三点解构信息检查: " f"灵感点={inspiration_count}, 目的点={purpose_count}, 关键点={key_points_count}" ) # 执行Agent(依赖实质元素) result = self.form_agent.process(state) # 更新状态 state.update(result) # 检查Agent返回结果中是否包含关键错误 if self._check_agent_result_for_errors(result, "形式元素提取Agent"): error_msg = "形式元素提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state final_elements = result.get("form_final_elements", []) logger.info(f"形式元素提取完成 - 最终元素数: {len(final_elements)}") except Exception as e: logger.error(f"形式元素提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"形式元素提取异常: {str(e)}" state.update({ "concrete_element_forms": [], "concrete_concept_forms": [], "overall_forms": [], "form_elements": [], "form_analyzed_result": [], "form_scored_result": {}, "form_weighted_result": {}, "form_filtered_ids": [], "form_categorized_result": {}, "form_final_elements": [] }) return state def _merge_all_results_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:分离实质和形式结果(脚本理解)""" logger.info("=== 执行节点:分离实质和形式结果 ===") try: # 获取实质和形式的最终元素 substance_final_elements = state.get("substance_final_elements", []) form_final_elements = state.get("form_final_elements", []) # 分别存储实质列表和形式列表 state["实质列表"] = substance_final_elements state["形式列表"] = form_final_elements logger.info(f"分离完成 - 实质元素: {len(substance_final_elements)}, 形式元素: {len(form_final_elements)}") except Exception as e: logger.error(f"分离结果失败: {e}", exc_info=True) state["实质列表"] = [] state["形式列表"] = [] return state def _search_keywords_extraction_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:搜索关键词提取(分支1的最后一步)""" logger.info("=== 执行节点:搜索关键词提取 ===") # 检查关键错误 if self._check_critical_error(state, "搜索关键词提取"): return state try: # 初始化Agent if not self.search_keyword_agent.is_initialized: self.search_keyword_agent.initialize() # 执行Agent(带重试机制) max_retries = 2 result = None last_error = None def _is_result_failed(result: Dict[str, Any]) -> tuple[bool, str]: """检查结果是否失败 Returns: (是否失败, 错误信息) """ if not result: return True, "结果为空" # 检查关键错误(如无法获取视频文件) if self._check_agent_result_for_errors(result, "搜索关键词提取Agent"): return True, "搜索关键词提取失败:无法获取视频文件" # 检查search_keywords中的错误字段 search_keywords = result.get("search_keywords", {}) if isinstance(search_keywords, dict): # 检查是否有错误字段 if "错误" in search_keywords or "error" in search_keywords: error_msg = search_keywords.get("错误") or search_keywords.get("error", "未知错误") return True, f"搜索关键词提取失败:{error_msg}" # 检查搜索词数量是否为0(可能是解析失败的表现) keyword_count = search_keywords.get("总数", 0) if keyword_count == 0: # 如果总数为0,可能是解析失败,需要重试 return True, "搜索关键词提取失败:未提取到搜索词(可能为解析失败)" return False, "" for attempt in range(max_retries): try: # 执行Agent result = self.search_keyword_agent.process(state) # 检查结果是否失败 is_failed, error_msg = _is_result_failed(result) if is_failed: if attempt < max_retries - 1: logger.warning(f"搜索关键词提取失败,准备重试 (尝试 {attempt + 1}/{max_retries}): {error_msg}") last_error = error_msg continue else: # 最后一次尝试也失败 state["workflow_failed"] = True state["workflow_error"] = error_msg return state else: # 成功,跳出重试循环 break except Exception as e: if attempt < max_retries - 1: logger.warning(f"搜索关键词提取异常,准备重试 (尝试 {attempt + 1}/{max_retries}): {e}") last_error = str(e) continue else: # 最后一次尝试也失败,抛出异常让外层catch处理 raise # 更新状态 if result: state.update(result) else: # 如果result为None,使用last_error error_msg = last_error or "搜索关键词提取失败:未知错误" state["workflow_failed"] = True state["workflow_error"] = error_msg return state # 再次检查(双重保险) if self._check_agent_result_for_errors(result, "搜索关键词提取Agent"): error_msg = "搜索关键词提取失败:无法获取视频文件" state["workflow_failed"] = True state["workflow_error"] = error_msg return state # 获取搜索关键词数量 search_keywords = result.get("search_keywords", {}) keyword_count = search_keywords.get("总数", 0) if isinstance(search_keywords, dict) else 0 logger.info(f"搜索关键词提取完成 - 共 {keyword_count} 个搜索词") # 标记分支1完成 state["branch1_completed"] = True except Exception as e: logger.error(f"搜索关键词提取失败: {e}", exc_info=True) state["workflow_failed"] = True state["workflow_error"] = f"搜索关键词提取异常: {str(e)}" state.update({ "search_keywords": { "搜索词列表": [], "总数": 0, "error": str(e) } }) state["branch1_completed"] = True # 即使失败也标记为完成,避免阻塞 return state def _structure_analysis_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:结构化内容库分析(ScriptWorkflowV2分支)""" logger.info("=== 执行节点:结构化内容库分析(分支2) ===") # 检查关键错误 if self._check_critical_error(state, "结构化内容库分析"): return state try: if not self.structure_agent.is_initialized: self.structure_agent.initialize() result = self.structure_agent.process(state) # 将结果存入 state 的 topic 字段 structure_data = result.get("structure_data", {}) state["topic"] = structure_data logger.info("结构化内容库分析完成") if isinstance(structure_data, dict): topic_info = structure_data.get("选题信息表", {}) macro_topic = topic_info.get("宏观母题", "") if isinstance(topic_info, dict) else "" if macro_topic: logger.info(f"宏观母题: {macro_topic}") except Exception as e: logger.error(f"结构化内容库分析失败: {e}", exc_info=True) state["topic"] = { "错误": str(e), } return state def _content_unit_split_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:L3 内容单元拆分(ScriptWorkflowV2分支)""" logger.info("=== 执行节点:L3 内容单元拆分(分支2) ===") # 检查关键错误 if self._check_critical_error(state, "L3 内容单元拆分"): return state try: if not self.content_unit_split_agent.is_initialized: self.content_unit_split_agent.initialize() result = self.content_unit_split_agent.process(state) state.update(result) analysis = result.get("content_unit_analysis", {}) logger.info( f"L3 单元拆分完成,单元数量: {len(analysis.get('单元列表', [])) if isinstance(analysis, dict) else 0}" ) except Exception as e: logger.error(f"L3 内容单元拆分失败: {e}", exc_info=True) state["content_unit_analysis"] = { "error": str(e), "单元列表": [], } return state def _content_unit_understand_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:整体结构理解(ScriptWorkflowV2分支)""" logger.info("=== 执行节点:整体结构理解(分支2) ===") # 检查关键错误 if self._check_critical_error(state, "整体结构理解"): return state try: if not self.content_unit_understand_agent.is_initialized: self.content_unit_understand_agent.initialize() result = self.content_unit_understand_agent.process(state) state.update(result) understanding = result.get("content_unit_understanding", {}) logger.info( f"整体结构理解完成,段落数量: {len(understanding.get('段落解构', [])) if isinstance(understanding, dict) else 0}" ) except Exception as e: logger.error(f"整体结构理解失败: {e}", exc_info=True) state["content_unit_understanding"] = { "error": str(e), "整体解构": {}, "段落解构": [], "单元解构": {}, } return state def _keyword_extraction_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:金句提取(ScriptWorkflowV2分支)""" logger.info("=== 执行节点:金句提取(分支2) ===") # 检查关键错误 if self._check_critical_error(state, "金句提取"): return state try: if not self.script_keyword_agent.is_initialized: self.script_keyword_agent.initialize() result = self.script_keyword_agent.process(state) state.update(result) script_keywords = result.get("script_keywords", {}) logger.info("金句提取完成") if isinstance(script_keywords, dict): hooks_count = len(script_keywords.get("hooks", [])) golden_sentences_count = len(script_keywords.get("golden_sentences", [])) logger.info(f"提取钩子数量: {hooks_count}, 金句数量: {golden_sentences_count}") # 标记分支2完成 state["branch2_completed"] = True except Exception as e: logger.error(f"金句提取失败: {e}", exc_info=True) state["script_keywords"] = { "error": str(e), } state["branch2_completed"] = True # 即使失败也标记为完成,避免阻塞 return state def _merge_branches_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:合并两个分支的结果 检查分支2是否完成,如果没完成则执行它 """ logger.info("=== 执行节点:合并分支结果 ===") try: # 检查分支2是否完成 branch2_completed = state.get("branch2_completed", False) if not branch2_completed: logger.info("分支2还未完成,现在执行分支2(ScriptWorkflowV2流程)") try: # 执行分支2的所有节点 if not self.structure_agent.is_initialized: self.structure_agent.initialize() structure_result = self.structure_agent.process(state) state["topic"] = structure_result.get("structure_data", {}) if not self.content_unit_split_agent.is_initialized: self.content_unit_split_agent.initialize() split_result = self.content_unit_split_agent.process(state) state.update(split_result) if not self.content_unit_understand_agent.is_initialized: self.content_unit_understand_agent.initialize() understand_result = self.content_unit_understand_agent.process(state) state.update(understand_result) if not self.script_keyword_agent.is_initialized: self.script_keyword_agent.initialize() keyword_result = self.script_keyword_agent.process(state) state.update(keyword_result) state["branch2_completed"] = True logger.info("分支2(ScriptWorkflowV2流程)执行完成") except Exception as e: logger.error(f"分支2执行失败: {e}", exc_info=True) state["branch2_completed"] = True # 即使失败也标记为完成,避免阻塞 else: logger.info("分支2已完成,直接合并结果") # 标记已合并 state["branches_merged"] = True logger.info("分支结果合并完成,准备进入结果汇总") except Exception as e: logger.error(f"合并分支结果失败: {e}", exc_info=True) state["branches_merged"] = True # 即使失败也标记为已合并,避免阻塞 return state def _result_aggregation_node(self, state: Dict[str, Any]) -> Dict[str, Any]: """节点:结果汇总(包含两个分支的结果)""" logger.info("=== 执行节点:结果汇总 ===") try: # 初始化Function if not self.result_aggregation_func.is_initialized: self.result_aggregation_func.initialize() # 执行Function(获取分支1的结果) final_result = self.result_aggregation_func.execute(state) # 添加分支2的结果(ScriptWorkflowV2的结果) topic = state.get("topic", {}) content_unit_analysis = state.get("content_unit_analysis", {}) content_unit_understanding = state.get("content_unit_understanding", {}) script_keywords = state.get("script_keywords", {}) # 将分支2的结果添加到最终结果中 final_result["脚本解构V2"] = { "结构化内容库": topic, "L3单元解构": content_unit_analysis, "整体结构理解": content_unit_understanding, "金句提取": script_keywords, } # 更新状态 state["final_result"] = final_result logger.info("结果汇总完成(包含两个分支的结果)") except Exception as e: logger.error(f"结果汇总失败: {e}", exc_info=True) state["final_result"] = { "视频信息": {}, "三点解构": { "灵感点": [], "目的点": {}, "关键点": {} }, "选题理解": {}, "脚本理解": { "内容品类": "未知", "段落列表": [], "实质列表": [], "形式列表": [] }, "脚本解构V2": { "结构化内容库": {}, "L3单元解构": {}, "整体结构理解": {}, "金句提取": {} }, "错误": f"汇总失败: {str(e)}" } return state def _update_db_record_after_workflow( self, state: Dict[str, Any], success: bool, final_result: Dict[str, Any] = None, error_msg: str = None ): """工作流执行完毕后更新数据库记录 Args: state: 工作流执行后的状态 success: 是否成功 final_result: 最终结果(成功时使用) error_msg: 错误信息(失败时使用) """ try: task_id = state.get("db_task_id") if not task_id: logger.warning("未找到 db_task_id,跳过数据库记录更新") return db = next(get_db()) try: record = db.query(DecodeVideo).filter_by(task_id=task_id).first() if not record: logger.warning(f"未找到 task_id={task_id} 的数据库记录,跳过更新") return if success: # 更新为成功状态 import json result_json = json.dumps(final_result, ensure_ascii=False) if final_result else None record.update_status(DecodeStatus.SUCCESS) record.update_result(result_json) # 提取分支2的结果(脚本解构V2)并存储到 decode_result_v2 字段 if final_result and isinstance(final_result, dict): script_v2_result = final_result.get("脚本解构V2") if script_v2_result: result_v2_json = json.dumps(script_v2_result, ensure_ascii=False) record.update_result_v2(result_v2_json) logger.info(f"更新数据库记录的分支2结果: task_id={task_id}") # 提取搜索关键词并存储到 search_keywords 字段(字符串数组的JSON格式) search_keywords_data = state.get("search_keywords", {}) if search_keywords_data and isinstance(search_keywords_data, dict): keyword_list = search_keywords_data.get("搜索词列表", []) if keyword_list: # 提取每个搜索词的"搜索词"字段,组成字符串数组 keyword_strings = [] for item in keyword_list: if isinstance(item, dict): keyword = item.get("搜索词", "") if keyword: keyword_strings.append(keyword) # 转换为JSON字符串数组格式 if keyword_strings: keywords_json = json.dumps(keyword_strings, ensure_ascii=False) record.update_search_keywords(keywords_json) logger.info(f"更新数据库记录的搜索关键词: task_id={task_id}, 关键词数量={len(keyword_strings)}") logger.info(f"更新数据库记录为成功: task_id={task_id}") else: # 更新为失败状态 record.update_status(DecodeStatus.FAILED, error_reason=error_msg) logger.info(f"更新数据库记录为失败: task_id={task_id}, error={error_msg}") db.commit() except Exception as e: logger.error(f"更新数据库记录失败: {e}", exc_info=True) db.rollback() finally: db.close() except Exception as e: logger.error(f"更新数据库记录节点执行失败: {e}", exc_info=True) def invoke(self, input_data: Dict[str, Any]) -> Dict[str, Any]: """执行工作流(公共接口)- 视频分析版本 Returns: 最终解码结果 """ logger.info("=== 开始执行解码工作流(视频分析) ===") logger.info(f"input_data: {input_data}") # 确保工作流已初始化 if not self.is_initialized: self.initialize() # 验证输入参数 video_url = input_data.get("video_url", "") if not video_url: error_msg = "未提供视频URL,无法执行工作流" logger.error(error_msg) return { "error": error_msg, "workflow_status": "failed", "input_data": input_data } # 初始化状态(包含视频信息,供视频上传和后续Agent使用) initial_state = { "video": video_url, "video_id": input_data.get("video_id", ""), "title": input_data.get("title", ""), "current_depth": 0, "max_depth": self.max_depth, "task_id": input_data.get("task_id", ""), } # 执行工作流 result = None try: result = self.compiled_graph.invoke(initial_state) except Exception as e: error_msg = f"工作流执行异常: {str(e)}" logger.error(error_msg, exc_info=True) # 更新数据库记录为失败状态(使用 initial_state 作为 fallback) if result is None: result = initial_state self._update_db_record_after_workflow(result, success=False, error_msg=error_msg) return { "status": 3, "error": error_msg, "workflow_status": "failed", "exception_type": type(e).__name__ } # 检查是否因为错误而终止 if result.get("workflow_failed"): error_msg = result.get("workflow_error", "工作流执行失败") logger.error(f"工作流因错误而终止: {error_msg}") # 更新数据库记录为失败状态 self._update_db_record_after_workflow(result, success=False, error_msg=error_msg) return { "error": error_msg, "video_upload_error": result.get("video_upload_error"), "workflow_status": "failed", "failed_at": result.get("failed_at", "unknown"), "status": 3 } # 检查是否有最终结果 final_result = result.get("final_result") if not final_result: # 如果没有最终结果,检查是否有错误信息 if result.get("workflow_error"): error_msg = result.get("workflow_error", "工作流执行失败,未生成结果") logger.error(f"工作流执行失败: {error_msg}") # 更新数据库记录为失败状态 self._update_db_record_after_workflow(result, success=False, error_msg=error_msg) return { "status": 3, "error": error_msg, "workflow_status": "failed" } else: logger.warning("工作流执行完成,但未生成最终结果") # 更新数据库记录为失败状态 self._update_db_record_after_workflow(result, success=False, error_msg="工作流执行完成,但未生成最终结果") return { "status": 3, "error": "工作流执行完成,但未生成最终结果", "workflow_status": "incomplete", "state": result } # 工作流执行成功,更新数据库记录 self._update_db_record_after_workflow(result, success=True, final_result=final_result) logger.info("=== 解码工作流执行完成(视频分析) ===") # 添加status字段(2表示成功) if isinstance(final_result, dict): final_result["status"] = 2 return final_result