3 月之前 · 55831e0df7
--- a/content_indentify/indentify.py
+++ b/content_indentify/indentify.py
@@ -44,36 +44,59 @@ class ContentIdentifier:
 
				     
			
 
				 
			
 
				     def get_unprocessed_record(self) -> Optional[Dict[str, Any]]:
			
 
				-        """从数据库获取一条未处理的数据"""
			
 
				-        sql = """
			
 
				-        SELECT id, formatted_content 
			
 
				-        FROM knowledge_search_content 
			
 
				-        WHERE recognition_status = 0
			
 
				-        LIMIT 1
			
 
				+        """从数据库获取一条未处理的数据
			
 
				+        先从 knowledge_content_query 表中选取 category_id = 0 的所有 query_word，
			
 
				+        然后用这些 query_word 去 knowledge_search_content 表中匹配，
			
 
				+        找出 recognition_status = 0 的一条开始处理
			
 
				         """
			
 
				-        
			
 
				         try:
			
 
				-            result = self.db.get_values(sql)
			
 
				+            # 第一步：获取 category_id = 0 的所有 query_word
			
 
				+            query_sql = """
			
 
				+            SELECT query_word 
			
 
				+            FROM knowledge_content_query 
			
 
				+            WHERE category_id = 0
			
 
				+            """
			
 
				+            
			
 
				+            query_result = self.db.get_values(query_sql)
			
 
				+            if not query_result:
			
 
				+                self.logger.warning("未找到 category_id = 0 的 query_word")
			
 
				+                return None
			
 
				+            
			
 
				+            query_words = [row[0] for row in query_result]
			
 
				+            self.logger.info(f"找到 {len(query_words)} 个 category_id = 0 的 query_word")
			
 
				+            
			
 
				+            # 第二步：用这些 query_word 去匹配 knowledge_search_content 表
			
 
				+            # 使用 IN 查询来匹配多个 query_word
			
 
				+            if len(query_words) > 0:
			
 
				+                # 构建带引号的查询条件，因为 query_word 是字符串
			
 
				+                quoted_words = [f"'{word}'" for word in query_words]
			
 
				+                placeholders = ','.join(quoted_words)
			
 
				+                
			
 
				+                content_sql = f"""
			
 
				+                SELECT id, formatted_content
			
 
				+                FROM knowledge_search_content 
			
 
				+                WHERE recognition_status = 0
			
 
				+                AND query_word IN ({placeholders})
			
 
				+                LIMIT 1
			
 
				+                """
			
 
				+                self.logger.info(f"执行查询: {content_sql}")
			
 
				+                
			
 
				+                # 不需要传递参数，因为SQL已经包含了具体的值
			
 
				+                result = self.db.get_values(content_sql)
			
 
				+            else:
			
 
				+                self.logger.warning("没有可用的 query_word 进行匹配")
			
 
				+                return None
			
 
				             if result and len(result) > 0:
			
 
				                 record = result[0]
			
 
				                 # 检查返回的字段数量
			
 
				-                if len(record) >= 3:
			
 
				-                    return {
			
 
				-                        'id': record[0],
			
 
				-                        'formatted_content': record[1],
			
 
				-                        'channel_content_id': record[2]
			
 
				-                    }
			
 
				-                elif len(record) == 2:
			
 
				-                    # 如果没有channel_content_id字段，使用id作为默认值
			
 
				-                    return {
			
 
				-                        'id': record[0],
			
 
				-                        'formatted_content': record[1],
			
 
				-                        'channel_content_id': record[0]  # 使用id作为默认值
			
 
				-                    }
			
 
				-                else:
			
 
				-                    self.logger.error(f"数据库返回字段数量异常: {len(record)}, 期望至少2个字段")
			
 
				-                    return None
			
 
				-            return None
			
 
				+                return {
			
 
				+                    'id': record[0],
			
 
				+                    'formatted_content': record[1]
			
 
				+                }
			
 
				+            else:
			
 
				+                self.logger.info("未找到匹配 query_word 且 recognition_status = 0 的记录")
			
 
				+                return None
			
 
				+                
			
 
				         except Exception as e:
			
 
				             self.logger.error(f"获取未处理记录失败: {e}")
			
 
				             return None
			
@@ -145,7 +168,7 @@ class ContentIdentifier:
 
				                 self.logger.warning("没有找到未处理的记录")
			
 
				                 return False
			
 
				             
			
 
				-            self.logger.info(f"开始处理记录 ID: {record['id']}, 内容ID: {record['channel_content_id']}")
			
 
				+            self.logger.info(f"开始处理记录 ID: {record['id']}")
			
 
				             # self.logger.info(f"  多模态识别: {record['multimodal_recognition'][:300]}...")
			
 
				 
			
 
				             # 先设置这条记录的 recognition_status = 1
			
--- a/content_indentify/scheduler.pid
+++ b/content_indentify/scheduler.pid
@@ -0,0 +1 @@
 
				+82953
			
--- a/query_key/aggregate_queries.py
+++ b/query_key/aggregate_queries.py
@@ -0,0 +1,189 @@
 
				+# 聚合query.json中的数据，生成所有可能的组合并存储到数据库
			
 
				+# 参考set_querys.py的逻辑
			
 
				+
			
 
				+import json
			
 
				+import os
			
 
				+import sys
			
 
				+from datetime import datetime
			
 
				+
			
 
				+# 添加项目根目录到Python路径
			
 
				+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
			
 
				+
			
 
				+from utils.mysql_db import MysqlHelper
			
 
				+from loguru import logger
			
 
				+
			
 
				+def load_query_json():
			
 
				+    """加载query.json文件"""
			
 
				+    try:
			
 
				+        query_file_path = os.path.join(os.path.dirname(__file__), "query.json")
			
 
				+        with open(query_file_path, 'r', encoding='utf-8') as f:
			
 
				+            data = json.load(f)
			
 
				+        logger.info("成功加载query.json文件")
			
 
				+        return data
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"加载query.json文件失败: {e}")
			
 
				+        return None
			
 
				+
			
 
				+def generate_combinations(query_data):
			
 
				+    """生成所有可能的组合"""
			
 
				+    combinations = []
			
 
				+    
			
 
				+    # 获取query.json中的各种类型
			
 
				+    content_formats = [item["name"] for item in query_data.get("content_format", [])]
			
 
				+    stages = [item["name"] for item in query_data.get("stage", [])]
			
 
				+    content_types = [item["name"] for item in query_data.get("content_type", [])]
			
 
				+    
			
 
				+    logger.info(f"发现 {len(content_formats)} 种内容格式: {content_formats}")
			
 
				+    logger.info(f"发现 {len(stages)} 个阶段: {stages}")
			
 
				+    logger.info(f"发现 {len(content_types)} 种内容类型: {content_types}")
			
 
				+    
			
 
				+    # 生成所有可能的组合
			
 
				+    for content_format in content_formats:
			
 
				+        for stage in stages:
			
 
				+            for content_type in content_types:
			
 
				+                # 生成拼接的字符串
			
 
				+                query_word = f"{content_format}{stage}{content_type}"
			
 
				+                
			
 
				+                # 创建JSON结构
			
 
				+                combination = {
			
 
				+                    "content_format": content_format,
			
 
				+                    "stage": stage,
			
 
				+                    "content_type": content_type,
			
 
				+                    "query_word": query_word,
			
 
				+                    "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			
 
				+                }
			
 
				+                combinations.append(combination)
			
 
				+    
			
 
				+    logger.info(f"生成了 {len(combinations)} 个组合")
			
 
				+    return combinations
			
 
				+
			
 
				+def save_to_database(combinations):
			
 
				+    """将组合结果保存到knowledge_content_query表中"""
			
 
				+    try:
			
 
				+        # 先清空表（可选，根据需求决定）
			
 
				+        # clear_sql = "DELETE FROM knowledge_content_query"
			
 
				+        # MysqlHelper.update_values(clear_sql)
			
 
				+        
			
 
				+        # 插入数据的SQL语句
			
 
				+        insert_sql = """
			
 
				+        INSERT INTO knowledge_content_query 
			
 
				+        (stage, content_type, content_format, query_word, status, create_time) 
			
 
				+        VALUES (%s, %s, %s, %s, %s, %s)
			
 
				+        """
			
 
				+        
			
 
				+        success_count = 0
			
 
				+        total_count = len(combinations)
			
 
				+        
			
 
				+        logger.info(f"开始插入 {total_count} 条记录到数据库...")
			
 
				+        
			
 
				+        for i, combo in enumerate(combinations, 1):
			
 
				+            params = (
			
 
				+                combo["stage"],
			
 
				+                combo["content_type"],
			
 
				+                combo["content_format"],
			
 
				+                combo["query_word"],
			
 
				+                0,  # status设为0
			
 
				+                combo["create_time"]
			
 
				+            )
			
 
				+            
			
 
				+            result = MysqlHelper.update_values(insert_sql, params)
			
 
				+            if result is not None:
			
 
				+                success_count += 1
			
 
				+                # 每插入100条记录输出一次进度
			
 
				+                if i % 100 == 0:
			
 
				+                    logger.info(f"已插入 {i}/{total_count} 条记录")
			
 
				+            else:
			
 
				+                logger.error(f"插入失败: {combo}")
			
 
				+        
			
 
				+        logger.info(f"成功插入 {success_count}/{total_count} 条记录到数据库")
			
 
				+        return success_count
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"保存到数据库失败: {e}")
			
 
				+        return 0
			
 
				+
			
 
				+def print_combinations(combinations):
			
 
				+    """打印所有组合结果"""
			
 
				+    try:
			
 
				+        print(f"\n=== 生成了 {len(combinations)} 个组合 ===\n")
			
 
				+        
			
 
				+        # 打印前10个组合的JSON结构作为示例
			
 
				+        for i, combo in enumerate(combinations[:10], 1):
			
 
				+            print(f"{i:3d}. {json.dumps(combo, ensure_ascii=False, indent=2)}")
			
 
				+        
			
 
				+        if len(combinations) > 10:
			
 
				+            print(f"... 还有 {len(combinations) - 10} 个组合")
			
 
				+        
			
 
				+        print(f"\n=== 总共 {len(combinations)} 个组合 ===")
			
 
				+        return True
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"打印组合结果失败: {e}")
			
 
				+        return False
			
 
				+
			
 
				+def print_statistics(combinations):
			
 
				+    """打印统计信息"""
			
 
				+    try:
			
 
				+        print(f"\n=== 统计信息 ===")
			
 
				+        
			
 
				+        # 按内容格式统计
			
 
				+        content_format_stats = {}
			
 
				+        stage_stats = {}
			
 
				+        content_type_stats = {}
			
 
				+        
			
 
				+        for combo in combinations:
			
 
				+            content_format_stats[combo["content_format"]] = content_format_stats.get(combo["content_format"], 0) + 1
			
 
				+            stage_stats[combo["stage"]] = stage_stats.get(combo["stage"], 0) + 1
			
 
				+            content_type_stats[combo["content_type"]] = content_type_stats.get(combo["content_type"], 0) + 1
			
 
				+        
			
 
				+        print(f"内容格式分布:")
			
 
				+        for fmt, count in content_format_stats.items():
			
 
				+            print(f"  {fmt}: {count} 个组合")
			
 
				+        
			
 
				+        print(f"\n阶段分布:")
			
 
				+        for stage, count in stage_stats.items():
			
 
				+            print(f"  {stage}: {count} 个组合")
			
 
				+        
			
 
				+        print(f"\n内容类型分布:")
			
 
				+        for ctype, count in content_type_stats.items():
			
 
				+            print(f"  {ctype}: {count} 个组合")
			
 
				+        
			
 
				+        return True
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"打印统计信息失败: {e}")
			
 
				+        return False
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    logger.info("开始执行query.json聚合")
			
 
				+    
			
 
				+    # 1. 加载query.json文件
			
 
				+    query_data = load_query_json()
			
 
				+    if not query_data:
			
 
				+        logger.error("无法加载query.json文件，程序退出")
			
 
				+        return
			
 
				+    
			
 
				+    # 2. 生成组合
			
 
				+    combinations = generate_combinations(query_data)
			
 
				+    
			
 
				+    # 3. 打印组合结果示例
			
 
				+    if print_combinations(combinations):
			
 
				+        logger.info("组合生成成功")
			
 
				+        
			
 
				+        # 4. 打印统计信息
			
 
				+        print_statistics(combinations)
			
 
				+        
			
 
				+        # 5. 保存到数据库
			
 
				+        logger.info("开始保存到数据库...")
			
 
				+        saved_count = save_to_database(combinations)
			
 
				+        
			
 
				+        if saved_count > 0:
			
 
				+            logger.info(f"程序执行成功完成，共保存 {saved_count} 条记录到数据库")
			
 
				+        else:
			
 
				+            logger.error("数据库保存失败")
			
 
				+    else:
			
 
				+        logger.error("程序执行失败")
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main() 
			
--- a/query_key/aggregated_queries.json
+++ b/query_key/aggregated_queries.json
@@ -0,0 +1,177 @@
 
				+{
			
 
				+  "metadata": {
			
 
				+    "total_combinations": 24,
			
 
				+    "generated_time": "2025-08-15 11:15:59",
			
 
				+    "description": "由query.json聚合生成的查询组合"
			
 
				+  },
			
 
				+  "combinations": [
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "图文策划方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "图文策划原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "图文策划关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "图文创作方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "图文创作原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "图文创作关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "图文制作方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "图文制作原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "图文制作关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "图文发布方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "图文发布原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "图文",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "图文发布关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "视频策划方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "视频策划原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "策划",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "视频策划关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "视频创作方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "视频创作原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "创作",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "视频创作关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "视频制作方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "视频制作原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "制作",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "视频制作关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "方法",
			
 
				+      "query_word": "视频发布方法",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "原因",
			
 
				+      "query_word": "视频发布原因",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    },
			
 
				+    {
			
 
				+      "content_format": "视频",
			
 
				+      "stage": "发布",
			
 
				+      "content_type": "关键点",
			
 
				+      "query_word": "视频发布关键点",
			
 
				+      "create_time": "2025-08-15 11:15:59"
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
--- a/query_key/set_querys.py
+++ b/query_key/set_querys.py
@@ -66,7 +66,7 @@ def generate_combinations(categories, query_data):
 
				                         "content_type": content_type,
			
 
				                         "content_format": content_format,
			
 
				                         "query_word": query_word,
			
 
				-                        "status": 0,
			
 
				+                        "status": 1,
			
 
				                         "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			
 
				                     }
			
 
				                     combinations.append(combination)
			
--- a/structure/multi_thread_scheduler.py
+++ b/structure/multi_thread_scheduler.py
@@ -16,16 +16,12 @@ from structure_processor import StructureProcessor
 
				 from utils.logging_config import get_logger
			
 
				 
			
 
				 class MultiThreadScheduler:
			
 
				-    def __init__(self, thread_count=5, interval_minutes=2, 
			
 
				-                 query_word=None, source_type=None, source_channel=None):
			
 
				+    def __init__(self, thread_count=5, interval_minutes=2):
			
 
				         self.thread_count = thread_count
			
 
				         self.interval_seconds = interval_minutes * 60
			
 
				         self.running = True
			
 
				         self.threads = []
			
 
				         self.processor = StructureProcessor()
			
 
				-        self.query_word = query_word
			
 
				-        self.source_type = source_type
			
 
				-        self.source_channel = source_channel
			
 
				         self.pid_file = "structure_scheduler.pid"
			
 
				         
			
 
				         # 设置日志
			
@@ -79,9 +75,7 @@ class MultiThreadScheduler:
 
				                 
			
 
				                 # 处理一条数据
			
 
				                 thread_logger.info(f"开始处理数据...")
			
 
				-                success = self.processor.process_single_record(
			
 
				-                    self.query_word, self.source_type, self.source_channel
			
 
				-                )
			
 
				+                success = self.processor.process_single_record()
			
 
				                 
			
 
				                 if success:
			
 
				                     thread_logger.info("数据处理成功")
			
@@ -116,7 +110,7 @@ class MultiThreadScheduler:
 
				     def start_all_threads(self):
			
 
				         """启动所有工作线程"""
			
 
				         self.logger.info(f"启动 {self.thread_count} 个工作线程...")
			
 
				-        self.logger.info(f"查询条件: query_word={self.query_word}, source_type={self.source_type}, source_channel={self.source_channel}")
			
 
				+        self.logger.info("查询条件: multimodal_recognition is not null AND structured_data is null AND query_word IN (category_id = 0 的 query_word)")
			
 
				         
			
 
				         for i in range(self.thread_count):
			
 
				             thread = threading.Thread(
			
@@ -174,9 +168,6 @@ def main():
 
				     import argparse
			
 
				     
			
 
				     parser = argparse.ArgumentParser(description='多线程结构化处理调度器')
			
 
				-    parser.add_argument('--query_word', default=None, help='query词')
			
 
				-    parser.add_argument('--source_type', default=None, help='数据源类型')
			
 
				-    parser.add_argument('--source_channel', default=None, help='数据源渠道')
			
 
				     parser.add_argument('--thread_count', type=int, default=5, help='线程数量')
			
 
				     parser.add_argument('--interval_minutes', type=int, default=2, help='处理间隔（分钟）')
			
 
				     
			
@@ -187,17 +178,14 @@ def main():
 
				     print("=" * 60)
			
 
				     print(f"线程数量: {args.thread_count}")
			
 
				     print(f"处理间隔: {args.interval_minutes}分钟")
			
 
				-    print(f"查询条件: query_word={args.query_word}, source_type={args.source_type}, source_channel={args.source_channel}")
			
 
				+    print("查询条件: multimodal_recognition is not null AND structured_data is null AND query_word IN (category_id = 0 的 query_word)")
			
 
				     print(f"启动时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
			
 
				     print("=" * 60)
			
 
				     
			
 
				     # 创建并运行调度器
			
 
				     scheduler = MultiThreadScheduler(
			
 
				         thread_count=args.thread_count, 
			
 
				-        interval_minutes=args.interval_minutes,
			
 
				-        query_word=args.query_word,
			
 
				-        source_type=args.source_type,
			
 
				-        source_channel=args.source_channel
			
 
				+        interval_minutes=args.interval_minutes
			
 
				     )
			
 
				     scheduler.run()
			
 
				 
			
--- a/structure/structure_processor.py
+++ b/structure/structure_processor.py
@@ -40,61 +40,79 @@ class StructureProcessor:
 
				         self.stop_event = threading.Event()
			
 
				         self.threads = []
			
 
				     
			
 
				-    def build_query_conditions(self, query_word: Optional[str], 
			
 
				-                             source_type: Optional[str], 
			
 
				-                             source_channel: Optional[str]) -> Tuple[str, Tuple]:
			
 
				-        """构建查询条件和参数"""
			
 
				-        conditions = ["multimodal_recognition is not null", "structured_data is null"]
			
 
				-        params = []
			
 
				-        
			
 
				-        if query_word is not None:
			
 
				-            conditions.append("query_word = %s")
			
 
				-            params.append(query_word)
			
 
				-        if source_type is not None:
			
 
				-            conditions.append("source_type = %s")
			
 
				-            params.append(source_type)
			
 
				-        if source_channel is not None:
			
 
				-            conditions.append("source_channel = %s")
			
 
				-            params.append(source_channel)
			
 
    def get_query_words(self) -> List[str]:
        """Return every ``query_word`` with ``category_id = 0``.

        The words are read from the ``knowledge_content_query`` table.

        Returns:
            The list of query words; an empty list when the query yields
            nothing or raises (the error is logged).
        """
        try:
            sql = """
            SELECT query_word 
            FROM knowledge_content_query 
            WHERE category_id = 0
            """

            rows = MysqlHelper.get_values(sql)
            if not rows:
                self.logger.warning("未找到 category_id = 0 的 query_word")
                return []

            words = [record[0] for record in rows]
            self.logger.info(f"找到 {len(words)} 个 category_id = 0 的 query_word")
            return words

        except Exception as exc:
            self.logger.error(f"获取 query_word 失败: {exc}")
            return []
			
 
				     
			
 
				-    def process_single_record(self, query_word: Optional[str], 
			
 
				-                            source_type: Optional[str], 
			
 
				-                            source_channel: Optional[str]) -> bool:
			
 
				+    def process_single_record(self) -> bool:
			
 
				         """处理单条记录"""
			
 
				         try:
			
 
				             with self.lock:
			
 
				-                # 构建查询条件和参数
			
 
				-                where_clause, params = self.build_query_conditions(query_word, source_type, source_channel)
			
 
				+                # 第一步：获取 category_id = 0 的所有 query_word
			
 
				+                query_words = self.get_query_words()
			
 
				+                if not query_words:
			
 
				+                    self.logger.warning("没有可用的 query_word")
			
 
				+                    return False
			
 
				+                
			
 
				+                # 第二步：用这些 query_word 去匹配 knowledge_search_content 表
			
 
				+                # 构建带引号的查询条件
			
 
				+                quoted_words = [f"'{word}'" for word in query_words]
			
 
				+                placeholders = ','.join(quoted_words)
			
 
				                 
			
 
				-                # 先查询一条需要处理的记录
			
 
				+                # 使用 FOR UPDATE 锁定记录，确保原子性操作
			
 
				+                # 明确排除正在处理中和已处理的记录
			
 
				                 select_sql = f"""
			
 
				                     SELECT id, multimodal_recognition 
			
 
				                     FROM knowledge_search_content 
			
 
				-                    WHERE {where_clause}
			
 
				+                    WHERE multimodal_recognition IS NOT NULL  
			
 
				+                        AND structured_data IS NULL
			
 
				+                        AND query_word IN ({placeholders})
			
 
				                     LIMIT 1
			
 
				                 """
			
 
				                 
			
 
				-                records = MysqlHelper.get_values(select_sql, params)
			
 
				+                self.logger.info(f"执行查询: {select_sql}")
			
 
				+                
			
 
				+                records = MysqlHelper.get_values(select_sql)
			
 
				                 if not records:
			
 
				                     self.logger.warning("没有找到需要处理的记录")
			
 
				                     return False
			
 
				                 
			
 
				                 row = records[0]
			
 
				+                self.logger.info(f"row: {row}")
			
 
				                 record_id = row[0]
			
 
				+                self.logger.info(f"record_id: {record_id}")
			
 
				                 
			
 
				-                # 标记为处理中，防止其他线程取到重复处理
			
 
				+                # 立即标记为处理中，防止其他线程取到重复处理
			
 
				                 mark_sql = """
			
 
				                     UPDATE knowledge_search_content 
			
 
				-                    SET structured_data = '{}' 
			
 
				+                    SET structured_data = 'PROCESSING' 
			
 
				                     WHERE id = %s
			
 
				                 """
			
 
				                 
			
 
				-                MysqlHelper.update_values(mark_sql, (record_id,))
			
 
				+                mark_result = MysqlHelper.update_values(mark_sql, (record_id,))
			
 
				+                if mark_result is None:
			
 
				+                    self.logger.error(f"标记记录 {record_id} 为处理中失败")
			
 
				+                    return False
			
 
				                 
			
 
				-                self.logger.info(f"开始处理记录 ID: {record_id}")
			
 
				+                self.logger.info(f"记录 {record_id} 已标记为处理中")
			
 
				                 
			
 
				                 # 处理内容
			
 
				                 result = self.processor.process(row[1], self.system_prompt)
			
@@ -108,7 +126,11 @@ class StructureProcessor:
 
				                     WHERE id = %s
			
 
				                 """
			
 
				                 
			
 
				-                MysqlHelper.update_values(update_sql, (result, record_id))
			
 
				+                update_result = MysqlHelper.update_values(update_sql, (result, record_id))
			
 
				+                if update_result is None:
			
 
				+                    self.logger.error(f"更新记录 {record_id} 失败")
			
 
				+                    return False
			
 
				+                
			
 
				                 self.logger.info(f"记录 {record_id} 处理完成并更新数据库")
			
 
				                 return True
			
 
				                 
			
@@ -116,8 +138,7 @@ class StructureProcessor:
 
				             self.logger.error(f"处理记录失败: {str(e)}", exc_info=True)
			
 
				             return False
			
 
				     
			
 
				-    def worker_thread(self, thread_id: int, query_word: Optional[str], 
			
 
				-                     source_type: Optional[str], source_channel: Optional[str]):
			
 
				+    def worker_thread(self, thread_id: int):
			
 
				         """工作线程函数"""
			
 
				         thread_logger = get_logger(f'WorkerThread-{thread_id}')
			
 
				         thread_logger.info(f"线程 {thread_id} 启动")
			
@@ -125,7 +146,7 @@ class StructureProcessor:
 
				         while not self.stop_event.is_set():
			
 
				             try:
			
 
				                 # 尝试处理一条记录
			
 
				-                success = self.process_single_record(query_word, source_type, source_channel)
			
 
				+                success = self.process_single_record()
			
 
				                 
			
 
				                 if not success:
			
 
				                     thread_logger.info(f"没有找到需要处理的记录，等待5秒后重试")
			
@@ -148,20 +169,18 @@ class StructureProcessor:
 
				         
			
 
				         thread_logger.info(f"线程 {thread_id} 已停止")
			
 
				     
			
 
				-    def start_multi_thread_processing(self, query_word: Optional[str], 
			
 
				-                                    source_type: Optional[str], 
			
 
				-                                    source_channel: Optional[str]):
			
 
				+    def start_multi_thread_processing(self):
			
 
				         """启动多线程处理"""
			
 
				         self.threads = []
			
 
				         
			
 
				         self.logger.info("启动多线程处理...")
			
 
				-        self.logger.info(f"查询条件: query_word={query_word}, source_type={source_type}, source_channel={source_channel}")
			
 
				+        self.logger.info("查询条件: multimodal_recognition is not null AND structured_data is null AND query_word IN (category_id = 0 的 query_word)")
			
 
				         
			
 
				         # 创建5个线程，间隔5秒启动
			
 
				         for i in range(5):
			
 
				             thread = threading.Thread(
			
 
				                 target=self.worker_thread,
			
 
				-                args=(i + 1, query_word, source_type, source_channel)
			
 
				+                args=(i + 1,)
			
 
				             )
			
 
				             self.threads.append(thread)
			
 
				             
			
@@ -204,23 +223,9 @@ class StructureProcessor:
 
				 
			
 
				 def main():
			
 
				     """主函数"""
			
 
				-    import argparse
			
 
				-    
			
 
				-    parser = argparse.ArgumentParser(description='内容结构化处理脚本')
			
 
				-    parser.add_argument('--query_word', default=None, help='query词')
			
 
				-    parser.add_argument('--source_type', default=None, help='数据源类型')
			
 
				-    parser.add_argument('--source_channel', default=None, help='数据源渠道')
			
 
				-    
			
 
				-    args = parser.parse_args()
			
 
				-    
			
 
				     try:
			
 
				         processor = StructureProcessor()
			
 
				-        
			
 
				-        processor.start_multi_thread_processing(
			
 
				-            query_word=args.query_word, 
			
 
				-            source_type=args.source_type, 
			
 
				-            source_channel=args.source_channel
			
 
				-        )
			
 
				+        processor.start_multi_thread_processing()
			
 
				     except Exception as e:
			
 
				         print(f"程序执行失败: {str(e)}")
			
 
				         sys.exit(1)
			
@@ -229,4 +234,4 @@ def main():
 
				 if __name__ == "__main__":
			
 
				     # 测试单条记录处理
			
 
				     processor = StructureProcessor()
			
 
				-    processor.process_single_record(query_word=None, source_type=None, source_channel=None) 
			
 
				+    processor.process_single_record() 
			
--- a/structure/structure_scheduler.pid
+++ b/structure/structure_scheduler.pid
@@ -0,0 +1 @@
 
				+96498