#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
system_monitor.py -- system monitoring script.

Monitors the scheduler's running state, memory usage, CPU usage, etc.,
and automatically handles abnormal conditions.
"""
import glob
import json
import logging
import os
import signal
import subprocess
import sys
import time
from datetime import datetime, timedelta
from typing import Dict, Any, Optional

import psutil
  17. # 配置日志
  18. logging.basicConfig(
  19. level=logging.INFO,
  20. format='%(asctime)s - %(levelname)s - %(message)s',
  21. handlers=[
  22. logging.FileHandler('logs/system_monitor.log'),
  23. logging.StreamHandler()
  24. ]
  25. )
  26. logger = logging.getLogger(__name__)
  27. class SystemMonitor:
  28. def __init__(self):
  29. self.config = {
  30. 'max_memory_mb': 2048, # 最大内存使用量
  31. 'max_cpu_percent': 80, # 最大CPU使用率
  32. 'max_disk_percent': 90, # 最大磁盘使用率
  33. 'check_interval': 30, # 检查间隔(秒)
  34. 'restart_delay': 60, # 重启延迟(秒)
  35. 'max_restarts': 5, # 最大重启次数
  36. 'pid_file': 'scheduler.pid', # 调度器PID文件
  37. 'log_file': 'logs/system_monitor.log'
  38. }
  39. self.restart_count = 0
  40. self.last_restart_time = None
  41. self.running = True
  42. # 设置信号处理
  43. signal.signal(signal.SIGINT, self.signal_handler)
  44. signal.signal(signal.SIGTERM, self.signal_handler)
  45. def signal_handler(self, signum, frame):
  46. """信号处理函数"""
  47. signal_name = "SIGTERM" if signum == signal.SIGTERM else "SIGINT"
  48. logger.info(f"收到信号 {signal_name},正在停止监控...")
  49. self.running = False
  50. def get_process_info(self) -> Optional[Dict[str, Any]]:
  51. """获取调度器进程信息"""
  52. try:
  53. if not os.path.exists(self.config['pid_file']):
  54. return None
  55. with open(self.config['pid_file'], 'r') as f:
  56. pid = int(f.read().strip())
  57. if not psutil.pid_exists(pid):
  58. return None
  59. process = psutil.Process(pid)
  60. return {
  61. 'pid': pid,
  62. 'name': process.name(),
  63. 'memory_mb': process.memory_info().rss / 1024 / 1024,
  64. 'cpu_percent': process.cpu_percent(),
  65. 'status': process.status(),
  66. 'create_time': process.create_time(),
  67. 'num_threads': process.num_threads()
  68. }
  69. except Exception as e:
  70. logger.error(f"获取进程信息失败: {e}")
  71. return None
  72. def check_system_resources(self) -> Dict[str, Any]:
  73. """检查系统资源使用情况"""
  74. try:
  75. # CPU使用率
  76. cpu_percent = psutil.cpu_percent(interval=1)
  77. # 内存使用率
  78. memory = psutil.virtual_memory()
  79. memory_percent = memory.percent
  80. memory_available_gb = memory.available / 1024 / 1024 / 1024
  81. # 磁盘使用率
  82. disk = psutil.disk_usage('/')
  83. disk_percent = disk.percent
  84. # 网络连接数
  85. net_connections = len(psutil.net_connections())
  86. return {
  87. 'cpu_percent': cpu_percent,
  88. 'memory_percent': memory_percent,
  89. 'memory_available_gb': memory_available_gb,
  90. 'disk_percent': disk_percent,
  91. 'net_connections': net_connections,
  92. 'timestamp': datetime.now().isoformat()
  93. }
  94. except Exception as e:
  95. logger.error(f"检查系统资源失败: {e}")
  96. return {}
  97. def check_logs_for_errors(self) -> bool:
  98. """检查日志文件中的错误"""
  99. try:
  100. log_files = [
  101. 'logs/scheduler_stdout.log',
  102. 'logs/scheduler_*.log'
  103. ]
  104. error_patterns = [
  105. 'double free',
  106. 'corruption',
  107. 'segmentation fault',
  108. 'memory error',
  109. 'out of memory',
  110. 'killed'
  111. ]
  112. for log_pattern in log_files:
  113. if '*' in log_pattern:
  114. # 处理通配符
  115. import glob
  116. log_files = glob.glob(log_pattern)
  117. else:
  118. log_files = [log_pattern]
  119. for log_file in log_files:
  120. if os.path.exists(log_file):
  121. try:
  122. with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
  123. content = f.read()
  124. for pattern in error_patterns:
  125. if pattern.lower() in content.lower():
  126. logger.warning(f"在日志文件 {log_file} 中发现错误模式: {pattern}")
  127. return True
  128. except Exception as e:
  129. logger.error(f"读取日志文件 {log_file} 失败: {e}")
  130. return False
  131. except Exception as e:
  132. logger.error(f"检查日志文件失败: {e}")
  133. return False
  134. def restart_scheduler(self) -> bool:
  135. """重启调度器"""
  136. try:
  137. current_time = datetime.now()
  138. # 检查重启频率限制
  139. if (self.last_restart_time and
  140. (current_time - self.last_restart_time).seconds < self.config['restart_delay']):
  141. logger.warning("重启过于频繁,跳过本次重启")
  142. return False
  143. if self.restart_count >= self.config['max_restarts']:
  144. logger.error(f"达到最大重启次数 ({self.config['max_restarts']}),停止重启")
  145. return False
  146. logger.info("正在重启调度器...")
  147. # 停止现有进程
  148. process_info = self.get_process_info()
  149. if process_info:
  150. try:
  151. os.kill(process_info['pid'], signal.SIGTERM)
  152. time.sleep(5)
  153. if psutil.pid_exists(process_info['pid']):
  154. os.kill(process_info['pid'], signal.SIGILL)
  155. except Exception as e:
  156. logger.error(f"停止进程失败: {e}")
  157. # 等待进程完全停止
  158. time.sleep(10)
  159. # 启动新进程
  160. try:
  161. subprocess.Popen(['python3', 'multi_thread_scheduler.py'],
  162. stdout=open('logs/scheduler_stdout.log', 'a'),
  163. stderr=subprocess.STDOUT)
  164. # 等待进程启动
  165. time.sleep(15)
  166. # 检查是否启动成功
  167. if self.get_process_info():
  168. logger.info("调度器重启成功")
  169. self.restart_count += 1
  170. self.last_restart_time = current_time
  171. return True
  172. else:
  173. logger.error("调度器重启失败")
  174. return False
  175. except Exception as e:
  176. logger.error(f"启动调度器失败: {e}")
  177. return False
  178. except Exception as e:
  179. logger.error(f"重启调度器过程中发生错误: {e}")
  180. return False
  181. def should_restart(self, process_info: Dict[str, Any], system_info: Dict[str, Any]) -> bool:
  182. """判断是否需要重启"""
  183. if not process_info:
  184. logger.warning("调度器进程不存在,需要重启")
  185. return True
  186. # 检查内存使用
  187. if process_info['memory_mb'] > self.config['max_memory_mb']:
  188. logger.warning(f"内存使用过高: {process_info['memory_mb']:.1f}MB > {self.config['max_memory_mb']}MB")
  189. return True
  190. # 检查CPU使用率
  191. if process_info['cpu_percent'] > self.config['max_cpu_percent']:
  192. logger.warning(f"CPU使用率过高: {process_info['cpu_percent']:.1f}% > {self.config['max_cpu_percent']}%")
  193. return True
  194. # 检查系统资源
  195. if system_info.get('memory_percent', 0) > 90:
  196. logger.warning(f"系统内存使用率过高: {system_info['memory_percent']:.1f}%")
  197. return True
  198. if system_info.get('disk_percent', 0) > self.config['max_disk_percent']:
  199. logger.warning(f"磁盘使用率过高: {system_info['disk_percent']:.1f}%")
  200. return True
  201. # 检查日志中的错误
  202. if self.check_logs_for_errors():
  203. logger.warning("检测到日志错误,需要重启")
  204. return True
  205. return False
  206. def run(self):
  207. """运行监控"""
  208. logger.info("系统监控启动")
  209. logger.info(f"配置: {json.dumps(self.config, indent=2)}")
  210. while self.running:
  211. try:
  212. # 获取进程信息
  213. process_info = self.get_process_info()
  214. # 获取系统资源信息
  215. system_info = self.check_system_resources()
  216. # 记录状态
  217. if process_info:
  218. logger.info(f"进程状态: PID={process_info['pid']}, "
  219. f"内存={process_info['memory_mb']:.1f}MB, "
  220. f"CPU={process_info['cpu_percent']:.1f}%")
  221. if system_info:
  222. logger.info(f"系统状态: CPU={system_info['cpu_percent']:.1f}%, "
  223. f"内存={system_info['memory_percent']:.1f}%, "
  224. f"磁盘={system_info['disk_percent']:.1f}%")
  225. # 检查是否需要重启
  226. if self.should_restart(process_info, system_info):
  227. if self.restart_scheduler():
  228. logger.info("重启操作完成")
  229. else:
  230. logger.error("重启操作失败")
  231. # 等待下次检查
  232. time.sleep(self.config['check_interval'])
  233. except Exception as e:
  234. logger.error(f"监控过程中发生错误: {e}")
  235. time.sleep(self.config['check_interval'])
  236. logger.info("系统监控已停止")
  237. def main():
  238. """主函数"""
  239. print("=" * 60)
  240. print("系统监控脚本")
  241. print("=" * 60)
  242. print(f"启动时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  243. print("=" * 60)
  244. try:
  245. monitor = SystemMonitor()
  246. monitor.run()
  247. except KeyboardInterrupt:
  248. print("\n收到中断信号,正在停止...")
  249. except Exception as e:
  250. print(f"监控脚本运行失败: {e}")
  251. sys.exit(1)
  252. if __name__ == "__main__":
  253. main()