jihuaqiang 1 week ago
parent
commit
a0e63cd504

+ 49 - 7
agent.py

@@ -29,7 +29,7 @@ except ImportError:
     print("警告: LangGraph 未安装,将使用传统模式")
 
 from utils.logging_config import get_logger
-from agent_tools import QueryDataTool, IdentifyTool, StructureTool
+from tools.agent_tools import QueryDataTool, IdentifyTool, UpdateDataTool, StructureTool
 
 # 创建 logger
 logger = get_logger('Agent')
@@ -164,7 +164,7 @@ def create_langgraph_workflow():
             state["identify_result"] = identify_result
             
             # Step 2: 结构化并入库
-            affected = StructureTool.store_parsing_result(
+            affected = UpdateDataTool.store_indentify_result(
                 state["request_id"], 
                 {
                     "content_id": state["content_id"],
@@ -172,8 +172,21 @@ def create_langgraph_workflow():
                 }, 
                 identify_result
             )
+            # 使用StructureTool进行内容结构化处理
+            structure_tool = StructureTool()
+            structure_result = structure_tool.process_content_structure(identify_result)
             
-            ok = affected is not None and affected > 0
+            # 存储结构化解析结果
+            parsing_affected = UpdateDataTool.store_parsing_result(
+                state["request_id"],
+                {
+                    "content_id": state["content_id"],
+                    "task_id": state["task_id"]
+                },
+                structure_result
+            )
+            
+            ok = affected is not None and affected > 0 and parsing_affected is not None and parsing_affected > 0
             if ok:
                 state["success"] += 1
             
@@ -212,7 +225,6 @@ def create_langgraph_workflow():
         
         current_index = state.get("current_index", 0)
         items = state.get("items", [])
-        
         if current_index >= len(items):
             # 所有数据处理完毕,更新状态为2
             update_request_status(state["request_id"], 2)
@@ -327,6 +339,7 @@ async def parse_processing(request: TriggerRequest, background_tasks: Background
             
             # 获取待处理数据
             items = QueryDataTool.fetch_crawl_data_list(request.requestId)
+            print(f"传统模式---items: {items}")
             if not items:
                 # 无数据需要处理,更新状态为完成
                 update_request_status(request.requestId, 2)
@@ -351,7 +364,7 @@ async def parse_processing(request: TriggerRequest, background_tasks: Background
                     )
                     
                     # Step 2: 结构化并入库
-                    affected = StructureTool.store_parsing_result(
+                    affected = UpdateDataTool.store_indentify_result(
                         request.requestId, 
                         {
                             content_id: item.get('content_id') or '',
@@ -360,6 +373,20 @@ async def parse_processing(request: TriggerRequest, background_tasks: Background
                         identify_result
                     )
                     
+                    # 使用StructureTool进行内容结构化处理
+                    structure_tool = StructureTool()
+                    structure_result = structure_tool.process_content_structure(identify_result)
+                    
+                    # 存储结构化解析结果
+                    parsing_affected = UpdateDataTool.store_parsing_result(
+                        request.requestId,
+                        {
+                            content_id: item.get('content_id') or '',
+                            task_id: item.get('task_id') or ''
+                        },
+                        structure_result
+                    )
+                    
                     ok = affected is not None and affected > 0
                     if ok:
                         success_count += 1
@@ -458,6 +485,7 @@ async def process_request_background(request_id: str):
             update_request_status(request_id, 1)
             
             items = QueryDataTool.fetch_crawl_data_list(request_id)
+            print(f"传统模式process_request_background---items: {items}")
             if not items:
                 logger.info(f"后台处理完成: requestId={request_id}, 无数据需要处理")
                 # 无数据需要处理,更新状态为完成
@@ -474,7 +502,7 @@ async def process_request_background(request_id: str):
                         crawl_data if isinstance(crawl_data, dict) else {}
                     )
                     
-                    affected = StructureTool.store_parsing_result(
+                    affected = UpdateDataTool.store_indentify_result(
                         request_id, 
                         {
                             content_id: item.get('content_id') or '',
@@ -483,10 +511,24 @@ async def process_request_background(request_id: str):
                         identify_result
                     )
                     
+                    # 使用StructureTool进行内容结构化处理
+                    structure_tool = StructureTool()
+                    structure_result = structure_tool.process_content_structure(identify_result)
+                    
+                    # 存储结构化解析结果
+                    parsing_affected = UpdateDataTool.store_parsing_result(
+                        request_id,
+                        {
+                            content_id: item.get('content_id') or '',
+                            task_id: item.get('task_id') or ''
+                        },
+                        structure_result
+                    )
+                    
                     if affected is not None and affected > 0:
                         success_count += 1
                     
-                    logger.info(f"后台处理进度: {idx}/{len(items)} - {'成功' if affected else '失败'}")
+                    logger.info(f"后台处理进度: {idx}/{len(items)} - {'成功' if affected else '失败'} - 结构化{'成功' if parsing_affected else '失败'}")
                     
                 except Exception as e:
                     logger.error(f"后台处理第 {idx} 项时出错: {e}")

+ 0 - 238
structure/structure_processor.py

@@ -1,238 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-内容结构化处理模块
-主要功能:
-1. 从数据库中拉取需要结构化的数据
-2. 调用Gemini API进行内容结构化
-3. 将结构化结果更新到数据库
-"""
-
-import os
-import json
-import time
-import sys
-import threading
-from typing import Dict, Any, List, Optional, Tuple
-
-# 导入自定义模块
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.mysql_db import MysqlHelper
-from gemini import GeminiProcessor
-from utils.file import File
-from utils.logging_config import get_logger
-
-
-class StructureProcessor:
-    def __init__(self):
-        # 设置日志
-        self.logger = get_logger('StructureProcessor')
-        
-        # 初始化处理器
-        self.processor = GeminiProcessor()
-        self.system_prompt = File.read_file('../prompt/structure.md')
-        self.logger.info("系统提示词加载完成")
-        self.logger.debug(f"系统提示词: {self.system_prompt}")
-        
-        # 线程控制
-        self.lock = threading.Lock()
-        self.stop_event = threading.Event()
-        self.threads = []
-    
-    def get_query_words(self) -> List[str]:
-        """从 knowledge_content_query 表中获取 category_id = 0 的所有 query_word"""
-        try:
-            sql = """
-            SELECT query_word 
-            FROM knowledge_content_query 
-            WHERE category_id = 0
-            """
-            
-            result = MysqlHelper.get_values(sql)
-            if result:
-                query_words = [row[0] for row in result]
-                self.logger.info(f"找到 {len(query_words)} 个 category_id = 0 的 query_word")
-                return query_words
-            else:
-                self.logger.warning("未找到 category_id = 0 的 query_word")
-                return []
-                
-        except Exception as e:
-            self.logger.error(f"获取 query_word 失败: {e}")
-            return []
-    
-    def process_single_record(self) -> bool:
-        """处理单条记录"""
-        try:
-            with self.lock:
-                # 第一步:获取 category_id = 0 的所有 query_word
-                query_words = self.get_query_words()
-                if not query_words:
-                    self.logger.warning("没有可用的 query_word")
-                    return False
-                
-                # 第二步:用这些 query_word 去匹配 knowledge_search_content 表
-                # 构建带引号的查询条件
-                quoted_words = [f"'{word}'" for word in query_words]
-                placeholders = ','.join(quoted_words)
-                
-                # 使用 FOR UPDATE 锁定记录,确保原子性操作
-                # 明确排除正在处理中和已处理的记录
-                select_sql = f"""
-                    SELECT id, multimodal_recognition 
-                    FROM knowledge_search_content 
-                    WHERE multimodal_recognition IS NOT NULL  
-                        AND structured_data IS NULL
-                        AND query_word IN ({placeholders})
-                    ORDER BY id ASC
-                    LIMIT 1
-                """
-                
-                self.logger.info(f"执行查询: {select_sql}")
-                
-                records = MysqlHelper.get_values(select_sql)
-                if not records:
-                    self.logger.warning("没有找到需要处理的记录")
-                    return False
-                
-                row = records[0]
-                self.logger.info(f"row: {row}")
-                record_id = row[0]
-                self.logger.info(f"record_id: {record_id}")
-                
-                # 立即标记为处理中,防止其他线程取到重复处理
-                mark_sql = """
-                    UPDATE knowledge_search_content 
-                    SET structured_data = 'PROCESSING' 
-                    WHERE id = %s
-                """
-                
-                mark_result = MysqlHelper.update_values(mark_sql, (record_id,))
-                if mark_result is None:
-                    self.logger.error(f"标记记录 {record_id} 为处理中失败")
-                    return False
-                
-                self.logger.info(f"记录 {record_id} 已标记为处理中")
-                
-                # 处理内容
-                result = self.processor.process(row[1], self.system_prompt)
-                self.logger.info(f"处理完成,结果长度: {len(str(result))}")
-                self.logger.debug(f"处理结果: {result}")
-                
-                # 更新数据库为实际结果
-                update_sql = """
-                    UPDATE knowledge_search_content 
-                    SET structured_data = %s 
-                    WHERE id = %s
-                """
-                
-                update_result = MysqlHelper.update_values(update_sql, (result, record_id))
-                if update_result is None:
-                    self.logger.error(f"更新记录 {record_id} 失败")
-                    return False
-                
-                self.logger.info(f"记录 {record_id} 处理完成并更新数据库")
-                return True
-                
-        except Exception as e:
-            self.logger.error(f"处理记录失败: {str(e)}", exc_info=True)
-            return False
-    
-    def worker_thread(self, thread_id: int):
-        """工作线程函数"""
-        thread_logger = get_logger(f'WorkerThread-{thread_id}')
-        thread_logger.info(f"线程 {thread_id} 启动")
-        
-        while not self.stop_event.is_set():
-            try:
-                # 尝试处理一条记录
-                success = self.process_single_record()
-                
-                if not success:
-                    thread_logger.info(f"没有找到需要处理的记录,等待5秒后重试")
-                    # 等待时也要检查停止信号
-                    if self.stop_event.wait(5):
-                        break
-                    continue
-                
-                # 处理成功后等待5秒再处理下一条
-                thread_logger.info(f"处理完成,等待5秒后处理下一条")
-                # 等待时也要检查停止信号
-                if self.stop_event.wait(5):
-                    break
-                
-            except Exception as e:
-                thread_logger.error(f"发生错误: {str(e)}", exc_info=True)
-                # 等待时也要检查停止信号
-                if self.stop_event.wait(5):
-                    break
-        
-        thread_logger.info(f"线程 {thread_id} 已停止")
-    
-    def start_multi_thread_processing(self):
-        """启动多线程处理"""
-        self.threads = []
-        
-        self.logger.info("启动多线程处理...")
-        self.logger.info("查询条件: multimodal_recognition is not null AND structured_data is null AND query_word IN (category_id = 0 的 query_word)")
-        
-        # 创建5个线程,间隔5秒启动
-        for i in range(5):
-            thread = threading.Thread(
-                target=self.worker_thread,
-                args=(i + 1,)
-            )
-            self.threads.append(thread)
-            
-            # 启动线程
-            thread.start()
-            self.logger.info(f"线程 {i + 1} 已启动")
-            
-            # 等待5秒后启动下一个线程
-            if i < 4:  # 最后一个线程不需要等待
-                self.logger.info("等待5秒后启动下一个线程...")
-                time.sleep(5)
-        
-        self.logger.info("所有线程已启动,使用 ./start_structure.sh stop 停止")
-        
-        try:
-            # 等待所有线程完成
-            for thread in self.threads:
-                thread.join()
-        except KeyboardInterrupt:
-            self.logger.info("收到停止信号,正在停止所有线程...")
-            self.stop_all_threads()
-    
-    def stop_all_threads(self):
-        """停止所有线程"""
-        self.logger.info("正在停止所有线程...")
-        self.stop_event.set()
-        
-        # 等待所有线程结束
-        for i, thread in enumerate(self.threads):
-            if thread.is_alive():
-                self.logger.info(f"等待线程 {i + 1} 结束...")
-                thread.join(timeout=10)  # 最多等待10秒
-                if thread.is_alive():
-                    self.logger.warning(f"线程 {i + 1} 未能正常结束")
-                else:
-                    self.logger.info(f"线程 {i + 1} 已正常结束")
-        
-        self.logger.info("所有线程已停止")
-
-
-def main():
-    """主函数"""
-    try:
-        processor = StructureProcessor()
-        processor.start_multi_thread_processing()
-    except Exception as e:
-        print(f"程序执行失败: {str(e)}")
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    # 测试单条记录处理
-    processor = StructureProcessor()
-    processor.process_single_record() 

+ 172 - 10
agent_tools.py → tools/agent_tools.py

@@ -8,6 +8,7 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from utils.logging_config import get_logger
 from utils.mysql_db import MysqlHelper
 from indentify.indentify import ContentIdentifier
+from structure import StructureProcessor
 
 logger = get_logger('AgentTools')
 
@@ -17,14 +18,14 @@ class QueryDataTool:
 
     @staticmethod
     def fetch_crawl_data_list(request_id: str) -> List[Dict[str, Any]]:
-        sql = "SELECT data FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
+        sql = "SELECT * FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
         rows = MysqlHelper.get_values(sql, (request_id,))
         if not rows:
             logger.info(f"request_id={request_id} 未查询到数据,使用默认值")
             # 返回默认数据
-            default_data = {
+            default_data = [{
                 "request_id": request_id,
-                "content_id": "684a789b000000002202a61b",
+                "content_id": "1",
                 "id": 1,
                 "task_id": 1,
                 "crawl_data": {
@@ -70,8 +71,56 @@ class QueryDataTool:
                     "modify_timestamp": 1749711589000,
                     "update_timestamp": 1755239186502
                 }
-            }
-            return [default_data]
+            },{
+                 "request_id": request_id,
+                "content_id": "2",
+                "id": 2,
+                "task_id": 2,
+                "crawl_data": {
+                    "channel": 1,
+                    "channel_content_id": "684a789b000000002202a61b",
+                    "content_link": "https://www.xiaohongshu.com/explore/684a789b000000002202a61b",
+                    "wx_sn": None,
+                    "title": "一个视频学会,5个剪辑工具,超详细教程",
+                    "content_type": "video",
+                    "body_text": "#剪辑教程[话题]# #剪辑[话题]# #手机剪辑[话题]# #视频制作[话题]# #视频剪辑[话题]# #自学剪辑[话题]# #原创视频[话题]# #新手小白学剪辑[话题]#",
+                    "location": "未知",
+                    "source_url": None,
+                    "mini_program": None,
+                    "topic_list": [],
+                    "image_url_list": [
+                        {
+                            "image_type": 2,
+                            "image_url": "http://rescdn.yishihui.com/pipeline/image/5be8f08a-4691-41b6-8dda-0b63cc2c1056.jpg"
+                        }
+                    ],
+                    "video_url_list": [
+                        {
+                            "video_url": "http://rescdn.yishihui.com/pipeline/video/9e38400e-21dc-4063-bab5-47c1667bb59d.mp4",
+                            "video_duration": 615
+                        }
+                    ],
+                    "bgm_data": None,
+                    "ad_info": None,
+                    "is_original": False,
+                    "voice_data": None,
+                    "channel_account_id": "670a10ac000000001d0216ec",
+                    "channel_account_name": "小伍剪辑视频",
+                    "channel_account_avatar": "https://sns-avatar-qc.xhscdn.com/avatar/1040g2jo31e469dkq0e005poa22m7c5ncbtuk1g0?imageView2/2/w/80/format/jpg",
+                    "item_index": None,
+                    "view_count": None,
+                    "play_count": None,
+                    "like_count": 692,
+                    "collect_count": 996,
+                    "comment_count": 37,
+                    "share_count": None,
+                    "looking_count": None,
+                    "publish_timestamp": 1749711589000,
+                    "modify_timestamp": 1749711589000,
+                    "update_timestamp": 1755239186502
+                }
+            }]
+            return default_data
 
         results: List[Dict[str, Any]] = []
         for row in rows:
@@ -158,14 +207,14 @@ class IdentifyTool:
             }
 
 
-class StructureTool:
+class UpdateDataTool:
     """
     结构化工具:按照既定的结构将识别结果与原始 crawl_data 组合,
     并存入 knowledge_parsing_content 表。
     """
-
+    
     @staticmethod
-    def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
+    def store_indentify_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
         """
         存储解析结果到 knowledge_parsing_content 表
         
@@ -186,7 +235,7 @@ class StructureTool:
             
             sql = (
                 "INSERT INTO knowledge_parsing_content "
-                "(content_id, request_id, task_id, parsing_data, create_time, status) "
+                "(content_id, request_id, task_id, indentify_data, create_time, status) "
                 "VALUES (%s, %s, %s, %s, NOW(), %s)"
             )
             
@@ -207,4 +256,117 @@ class StructureTool:
             
         except Exception as e:
             logger.error(f"存储解析结果失败: request_id={request_id}, error={e}")
-            return None
+            return None
+    @staticmethod
+    def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], parsing_result: Dict[str, Any]) -> Optional[int]:
+        """
+        存储解析结果到 knowledge_parsing_content 表
+        
+        Args:
+            request_id: 请求ID
+            crawl_raw: 原始爬取数据
+            parsing_result: 结构化的结果(建议传入 StructureTool.process_content_structure 的返回值)
+            
+        Returns:
+            受影响的行数,失败返回None
+        """
+        try:
+            # 从原始数据中提取必要字段
+            content_id = crawl_raw.get('content_id') or ''
+            
+            # 只提取result字段的内容,如果不存在则使用整个对象
+            structured_content = parsing_result.get('structured_content', {})
+            if isinstance(structured_content, dict) and 'result' in structured_content:
+                # 如果structured_content是字典且包含result字段,只存储result字段
+                parsing_payload = structured_content['result']
+            else:
+                # 否则存储整个structured_content
+                parsing_payload = structured_content
+            
+            # 更新数据
+            sql = (
+                "UPDATE knowledge_parsing_content "
+                "SET parsing_data = %s, status = %s "
+                "WHERE content_id = %s"
+            )
+            
+            # 状态:5 表示结构化处理完成
+            status = 5
+            params = (
+                json.dumps(parsing_payload, ensure_ascii=False),
+                status,
+                content_id
+            )
+            
+            result = MysqlHelper.update_values(sql, params)
+            if result:
+                logger.info(f"存储解析结果成功: request_id={request_id}, content_id={content_id}")
+            return result
+            
+        except Exception as e:
+            logger.error(f"存储解析结果失败: request_id={request_id}, error={e}")
+            return None
+
+class StructureTool:
+    """
+    内容结构化工具:调用tools/structure内部的方法进行内容结构化处理
+    """
+    
+    def __init__(self):
+        """初始化结构化工具"""
+        self.structure_processor = StructureProcessor()
+    
+    def process_content_structure(self, content_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        处理内容结构化
+        
+        Args:
+            content_data: 包含识别结果的内容数据,格式如下:
+            {
+                'channel': str,
+                'title': str,
+                'content': str,
+                'images': List[str],
+                'videos': Dict,
+                'meta': Dict
+            }
+            
+        Returns:
+            Dict[str, Any]: 结构化处理后的结果
+        """
+        try:
+            # 结构化输入规范化
+            structure_input = {
+                "title": content_data.get('title', ''),
+                "body_text": content_data.get('content', ''),
+                "images_comprehension": content_data.get('images', [])
+            }
+            
+            # 调用结构化处理器
+            structured_content = self.structure_processor.process_content(structure_input)
+            
+            # 若返回为字符串或字典,直接封装;不访问 .result
+            result = {
+                'original_data': content_data,
+                'structured_content': structured_content,
+                'structure_status': 'success',
+                'process_time': self._get_current_timestamp()
+            }
+            
+            logger.info(f"内容结构化处理成功: title={content_data.get('title', '')}")
+            return result
+            
+        except Exception as e:
+            logger.error(f"内容结构化处理失败: {e}")
+            return {
+                'original_data': content_data,
+                'structured_content': '',
+                'structure_status': 'failed',
+                'error': str(e),
+                'process_time': self._get_current_timestamp()
+            }
+    
+    def _get_current_timestamp(self) -> str:
+        """获取当前时间戳字符串"""
+        from datetime import datetime
+        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

+ 0 - 5
indentify/image_identifier.py → tools/indentify/image_identifier.py

@@ -93,11 +93,9 @@ class ImageIdentifier:
             if not image_urls:
                 return {"images_comprehension": [], "error": "没有图片需要分析"}
             
-            print(f"正在使用Gemini API分析 {len(image_urls)} 张图片...")
             results = []
             
             for i, image_url in enumerate(image_urls):
-                print(f"正在处理第 {i+1} 张图片: {image_url}")
                 
                 # 下载图片
                 image = self.download_image(image_url)
@@ -128,11 +126,9 @@ class ImageIdentifier:
     
     def process_images(self, formatted_content: Dict[str, Any]) -> Dict[str, Any]:
         """处理图片识别的主函数"""
-        print("开始图片OCR识别处理...")
         
         # 提取图片URL
         image_urls = self.extract_image_urls(formatted_content)
-        print(f"提取到 {len(image_urls)} 张图片")
         
         if not image_urls:
             print("没有图片需要分析")
@@ -143,7 +139,6 @@ class ImageIdentifier:
         
         if result.get("images_comprehension"):
             successful_count = sum(1 for img in result['images_comprehension'] if img.get('success', False))
-            print(f"图片OCR识别完成,成功分析 {successful_count}/{len(result['images_comprehension'])} 张图片")
         else:
             print("图片OCR识别失败")
         

+ 0 - 0
indentify/indentify.py → tools/indentify/indentify.py


+ 0 - 10
indentify/video_identifier.py → tools/indentify/video_identifier.py

@@ -531,23 +531,19 @@ class VideoIdentifier:
     
     def process_videos(self, formatted_content: Dict[str, Any]) -> List[Dict[str, Any]]:
         """处理视频识别的主函数"""
-        print("开始视频识别处理...")
         
         # 定期清理缓存
         self.cleanup_cache()
         
         # 提取视频URL
         video_data = self.extract_video_urls(formatted_content)
-        print(f"提取到 {len(video_data)} 个视频")
         
         if not video_data:
-            print("没有视频需要分析")
             return []
         
         # 逐个处理视频
         results = []
         for i, video_info in enumerate(video_data):
-            print(f"\n处理视频 {i+1}/{len(video_data)}")
             result = self.process_video_single(video_info)
             results.append(result)
             
@@ -555,12 +551,6 @@ class VideoIdentifier:
             if i < len(video_data) - 1:  # 不是最后一个视频
                 time.sleep(2)
         
-        if results:
-            print(f"\n视频识别完成,共分析 {len(results)} 个视频")
-            print("分析维度:ASR、关键帧提取")
-        else:
-            print("视频识别失败")
-        
         return results
 
 

+ 102 - 0
tools/structure.py

@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+内容结构化处理模块
+简化版本:只提供核心的内容结构化功能
+"""
+
+import os
+import sys
+import json
+from typing import Any, Dict, Union
+
+# 导入自定义模块
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from gemini import GeminiProcessor
+
+
+class StructureProcessor:
+    def __init__(self, prompt_file: str = None):
+        """
+        初始化结构化处理器
+        
+        Args:
+            prompt_file: 提示词文件路径,默认为prompt/structure.md
+        """
+        self.processor = GeminiProcessor()
+        
+        # 修复提示词文件路径
+        if prompt_file is None:
+            # 从tools目录找到项目根目录下的prompt/structure.md
+            current_dir = os.path.dirname(os.path.abspath(__file__))  # tools
+            project_root = os.path.dirname(current_dir)  # 项目根目录
+            prompt_file = os.path.join(project_root, "prompt", "structure.md")
+        
+        self.prompt_file = prompt_file
+        self.system_prompt = self._load_structure_prompt()
+    
+    def _load_structure_prompt(self) -> str:
+        """加载结构化处理提示词"""
+        try:
+            with open(self.prompt_file, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception as e:
+            print(f"加载提示词文件失败: {e}")
+            print(f"尝试加载的文件路径: {self.prompt_file}")
+            return ""
+    
+    def process_content(self, content: Union[str, Dict, Any], custom_prompt: str = None) -> str:
+        """
+        处理内容结构化
+        
+        Args:
+            content: 需要结构化的内容,可以是字符串、字典或JSON对象
+            custom_prompt: 自定义提示词,如果为None则使用默认的结构化提示词
+            
+        Returns:
+            str: 结构化后的内容
+        """
+        try:
+            # 如果content是字典,转换为JSON字符串
+            if isinstance(content, dict):
+                content_str = json.dumps(content, ensure_ascii=False, indent=2)
+            else:
+                content_str = str(content)
+            
+            # 使用自定义提示词或默认的结构化提示词
+            prompt = custom_prompt or self.system_prompt
+            
+            # 构建完整的提示词
+            full_prompt = f"{prompt}\n\n## 输入\n用户将提供一个包含 `title`、`body_text` 和 `images_comprehension` 的JSON对象。\n\n请处理以下内容:\n{content_str}"
+            
+            result = self.processor.process(content_str, full_prompt)
+            return result
+        except Exception as e:
+            print(f"内容结构化处理失败: {e}")
+            return ""
+
+
+def main():
+    """测试函数"""
+    processor = StructureProcessor()
+    
+    # 测试数据 - 模拟实际的JSON输入
+    test_content = {
+        "title": "如何制作美味的蛋糕",
+        "body_text": "制作蛋糕需要准备一些基本的材料和工具,按照正确的步骤操作就能做出美味的蛋糕。",
+        "images_comprehension": [
+            "第一步:准备材料 - 面粉、鸡蛋、糖、牛奶",
+            "第二步:混合材料 - 将面粉和糖混合,加入鸡蛋和牛奶",
+            "第三步:烘烤 - 在180度烤箱中烘烤30分钟",
+            "注意事项:确保材料新鲜,烤箱预热到位"
+        ]
+    }
+    
+    print("使用默认结构化提示词处理内容...")
+    result = processor.process_content(test_content)
+    print(f"处理结果:\n{result}")
+
+
+if __name__ == "__main__":
+    main()