|
@@ -0,0 +1,249 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
"""
Content evaluation module.

Main features:
1. Pull structured records that still need evaluation from the database
2. Score the content via the Gemini API
3. Write the score/reason back to the database
"""
|
|
|
+
|
|
|
+import os
|
|
|
+import json
|
|
|
+import time
|
|
|
+import sys
|
|
|
+import re
|
|
|
+import threading
|
|
|
+from typing import Dict, Any, List, Optional, Tuple
|
|
|
+
|
|
|
+# 导入自定义模块
|
|
|
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
+
|
|
|
+from utils.mysql_db import MysqlHelper
|
|
|
+from gemini import GeminiProcessor
|
|
|
+from utils.file import File
|
|
|
+from utils.logging_config import get_logger
|
|
|
+
|
|
|
+
|
|
|
class EvaluateProcessor:
    """Score un-evaluated structured records with Gemini and persist results.

    Workflow per record: claim one row from ``knowledge_search_content`` that
    has ``structured_data`` but no score yet, mark it in-progress with a
    placeholder score of '-1', call the Gemini API, then update the real
    score/reason in ``knowledge_content_evaluate``.

    A shared lock serializes only the claim step (select + mark) so the five
    worker threads never pick up the same record twice, while the slow API
    call runs concurrently.
    """

    def __init__(self):
        # Module logger for this processor
        self.logger = get_logger('EvaluateProcessor')

        # Gemini client and the evaluation system prompt.
        # NOTE(review): the prompt path is relative to the CWD — confirm the
        # script is always launched from the directory this path assumes.
        self.processor = GeminiProcessor()
        self.system_prompt = File.read_file('../prompt/evaluate.md')
        self.logger.info("系统提示词加载完成")
        self.logger.debug(f"系统提示词: {self.system_prompt}")

        # Thread coordination: lock guards record claiming, stop_event
        # signals shutdown, threads holds the worker Thread objects.
        self.lock = threading.Lock()
        self.stop_event = threading.Event()
        self.threads = []

    def build_query_conditions(self, query_word: Optional[str],
                               source_type: Optional[str],
                               source_channel: Optional[str]) -> Tuple[str, Tuple]:
        """Build the WHERE clause and parameter tuple for the claim query.

        Base conditions restrict to rows that have structured data but no
        evaluation score yet; each non-None filter adds one ``%s`` condition
        with a matching positional parameter.

        Returns:
            (where_clause, params): SQL fragment joined with AND, and the
            parameters in the same order as their placeholders.
        """
        conditions = ["a.structured_data is not null", "b.score is null"]
        params = []

        if query_word is not None:
            conditions.append("a.query_word = %s")
            params.append(query_word)
        if source_type is not None:
            conditions.append("a.source_type = %s")
            params.append(source_type)
        if source_channel is not None:
            conditions.append("a.source_channel = %s")
            params.append(source_channel)

        where_clause = " AND ".join(conditions)
        return where_clause, tuple(params)

    def process_single_record(self, query_word: Optional[str],
                              source_type: Optional[str],
                              source_channel: Optional[str]) -> bool:
        """Claim one pending record, score it with Gemini and persist the result.

        Returns:
            True when a record was processed and updated, False when no
            record was found or any step failed.
        """
        try:
            # Claim phase: hold the lock only while selecting and marking a
            # record. Once marked, other workers can no longer select it, so
            # the slow Gemini call below runs without serializing threads.
            with self.lock:
                where_clause, params = self.build_query_conditions(query_word, source_type, source_channel)

                # Fetch a single record that still needs evaluation
                select_sql = f"""
                    SELECT a.id, a.query_word, a.structured_data
                    FROM knowledge_search_content a
                    left join knowledge_content_evaluate b on a.id = b.search_content_id
                    WHERE {where_clause}
                    LIMIT 1
                """
                records = MysqlHelper.get_values(select_sql, params)
                if not records:
                    self.logger.warning("没有找到需要处理的记录")
                    return False

                row = records[0]
                record_id = row[0]

                # Mark as in-progress with a '-1' placeholder score so
                # concurrent workers skip this record.
                mark_sql = """
                    insert into knowledge_content_evaluate (search_content_id, score)
                    values (%s, '-1')
                """
                # Bug fix: parameters must be a tuple — (record_id) without a
                # trailing comma is just the bare int, not a 1-tuple.
                MysqlHelper.update_values(mark_sql, (record_id,))

            self.logger.info(f"开始处理记录 ID: {record_id}")

            # Build the user prompt; the model is instructed to reply with a
            # single JSON object containing 'score' and 'reason'.
            user_prompt = f"""
            # 任务 (Task)
            现在,请根据以下输入,严格执行你的任务。你的最终输出必须且只能是一个JSON对象。
            ## 输入:
            Query: {row[1]}
            Content: {row[2]}
            """

            result = self.processor.process(user_prompt, self.system_prompt)
            # Strip the ```json ... ``` markdown fences the model sometimes adds
            result = re.sub(r'^\s*```json|\s*```\s*$', '', result, flags=re.MULTILINE).strip()
            self.logger.info(f"处理完成,结果长度: {len(str(result))}")
            self.logger.info(f"处理结果: {result}")

            # Overwrite the '-1' placeholder with the real score and reason
            update_sql = """
                UPDATE knowledge_content_evaluate
                SET score = %s ,reason = %s
                WHERE search_content_id = %s
            """
            parsed = json.loads(result)
            score = parsed['score']
            reason = parsed['reason']

            MysqlHelper.update_values(update_sql, (score, reason, record_id))
            self.logger.info(f"记录 {record_id} 处理完成并更新数据库")
            return True

        except Exception as e:
            # NOTE(review): if Gemini or JSON parsing fails after the record
            # was marked, it keeps score '-1' and is never retried by this
            # query — confirm a cleanup/requeue pass exists elsewhere.
            self.logger.error(f"处理记录失败: {str(e)}", exc_info=True)
            return False

    def worker_thread(self, thread_id: int, query_word: Optional[str],
                      source_type: Optional[str], source_channel: Optional[str]):
        """Worker loop: keep processing records until stop_event is set."""
        thread_logger = get_logger(f'WorkerThread-{thread_id}')
        thread_logger.info(f"线程 {thread_id} 启动")

        while not self.stop_event.is_set():
            try:
                # Attempt to claim and process one record
                success = self.process_single_record(query_word, source_type, source_channel)

                if not success:
                    thread_logger.info(f"没有找到需要处理的记录,等待5秒后重试")
                    # wait() doubles as an interruptible sleep: returns True
                    # (and we exit) as soon as stop_event is set.
                    if self.stop_event.wait(5):
                        break
                    continue

                # Throttle: pause 5s between successful records
                thread_logger.info(f"处理完成,等待5秒后处理下一条")
                if self.stop_event.wait(5):
                    break

            except Exception as e:
                thread_logger.error(f"发生错误: {str(e)}", exc_info=True)
                # Back off after an unexpected error, still honoring shutdown
                if self.stop_event.wait(5):
                    break

        thread_logger.info(f"线程 {thread_id} 已停止")

    def start_multi_thread_processing(self, query_word: Optional[str],
                                      source_type: Optional[str],
                                      source_channel: Optional[str]):
        """Launch five worker threads (staggered 5s apart) and wait on them.

        Blocks until all workers finish; Ctrl-C triggers a cooperative
        shutdown via stop_all_threads().
        """
        self.threads = []

        self.logger.info("启动多线程处理...")
        self.logger.info(f"查询条件: query_word={query_word}, source_type={source_type}, source_channel={source_channel}")

        # Spawn 5 workers, pausing 5s between launches to spread out load
        for i in range(5):
            thread = threading.Thread(
                target=self.worker_thread,
                args=(i + 1, query_word, source_type, source_channel)
            )
            self.threads.append(thread)

            thread.start()
            self.logger.info(f"线程 {i + 1} 已启动")

            if i < 4:  # no pause needed after the last thread
                self.logger.info("等待5秒后启动下一个线程...")
                time.sleep(5)

        self.logger.info("所有线程已启动,使用 ./start_evaluate.sh stop 停止")

        try:
            # Block until every worker exits
            for thread in self.threads:
                thread.join()
        except KeyboardInterrupt:
            self.logger.info("收到停止信号,正在停止所有线程...")
            self.stop_all_threads()

    def stop_all_threads(self):
        """Signal shutdown and join every worker (10s timeout each)."""
        self.logger.info("正在停止所有线程...")
        self.stop_event.set()

        # Give each worker up to 10s to observe stop_event and exit
        for i, thread in enumerate(self.threads):
            if thread.is_alive():
                self.logger.info(f"等待线程 {i + 1} 结束...")
                thread.join(timeout=10)
                if thread.is_alive():
                    self.logger.warning(f"线程 {i + 1} 未能正常结束")
                else:
                    self.logger.info(f"线程 {i + 1} 已正常结束")

        self.logger.info("所有线程已停止")
|
|
|
+
|
|
|
+
|
|
|
def main():
    """CLI entry point: parse the optional filters and run the workers.

    NOTE(review): the argparse description still says "结构化处理" although
    this script performs evaluation — likely copied from a sibling script.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(description='内容结构化处理脚本')
    for flag, help_text in (('--query_word', 'query词'),
                            ('--source_type', '数据源类型'),
                            ('--source_channel', '数据源渠道')):
        arg_parser.add_argument(flag, default=None, help=help_text)

    cli_args = arg_parser.parse_args()

    try:
        EvaluateProcessor().start_multi_thread_processing(
            query_word=cli_args.query_word,
            source_type=cli_args.source_type,
            source_channel=cli_args.source_channel
        )
    except Exception as e:
        print(f"程序执行失败: {str(e)}")
        sys.exit(1)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Ad-hoc smoke test: process exactly one record with no filters,
    # bypassing argparse and the multi-thread loop.
    # NOTE(review): main() is defined above but never called — restore
    # `main()` here to get the documented CLI behavior back.
    processor = EvaluateProcessor()
    processor.process_single_record(query_word=None, source_type=None, source_channel=None)
|