start_scheduler.sh 9.7 KB


  1. #!/bin/bash
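  # Launcher script for the multi-threaded scheduler.
  # Usage:
  # ./start_scheduler.sh start         # start the scheduler (default when no argument is given)
  # ./start_scheduler.sh stop          # stop the scheduler and the monitor
  # ./start_scheduler.sh status        # show status
  # ./start_scheduler.sh restart       # restart the scheduler
  # ./start_scheduler.sh monitor       # monitor mode (automatic restart)
  # ./start_scheduler.sh stop-monitor  # stop monitor mode
  # ./start_scheduler.sh cache-status | cache-clean | cache-cleanup  # cache maintenance via cache_manager.py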
  # ---------------------------------------------------------------------------
  # Configuration (all paths are relative to the current working directory).
  # ---------------------------------------------------------------------------
  SCRIPT_NAME="multi_thread_scheduler.py"
  PID_FILE="scheduler.pid"
  LOG_DIR="logs"
  MONITOR_PID_FILE="monitor.pid"
  MAX_MEMORY_MB=2048 # memory ceiling (MB); the monitor restarts the scheduler above it
  RESTART_COUNT_FILE="restart_count.txt"
  MAX_RESTARTS=10 # maximum number of restarts the monitor attempts

  # Verify the Python environment.
  if ! command -v python3 > /dev/null 2>&1; then
    echo "错误: 未找到Python3,请先安装Python3" >&2
    exit 1
  fi

  # Verify the required files exist.
  # NOTE(review): "indentify.py" may be a misspelling of "identify.py" -- confirm
  # against the actual file name before changing it.
  if [ ! -f "indentify.py" ]; then
    echo "错误: 未找到indentify.py文件" >&2
    exit 1
  fi
  # Use the configured name so the check cannot drift from what is launched.
  if [ ! -f "$SCRIPT_NAME" ]; then
    echo "错误: 未找到${SCRIPT_NAME}文件" >&2
    exit 1
  fi

  # Create the log directory; every later redirect into it would fail otherwise.
  if ! mkdir -p "$LOG_DIR"; then
    echo "错误: 无法创建日志目录 $LOG_DIR" >&2
    exit 1
  fi
  # Print the scheduler PID recorded in $PID_FILE.
  # Prints an empty line when the PID file does not exist.
  get_pid() {
    if [ ! -f "$PID_FILE" ]; then
      echo ""
      return
    fi
    cat "$PID_FILE"
  }
  # Print the monitor PID recorded in $MONITOR_PID_FILE.
  # Prints an empty line when the PID file does not exist.
  get_monitor_pid() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
      echo ""
      return
    fi
    cat "$MONITOR_PID_FILE"
  }
  # Report whether the scheduler recorded in the PID file is alive.
  # Returns: 0 when `ps -p` finds the recorded PID, 1 otherwise.
  # Side effect: a PID file pointing at a missing process is deleted.
  is_running() {
    local scheduler_pid
    scheduler_pid=$(get_pid)
    # No recorded PID: nothing to check.
    [ -z "$scheduler_pid" ] && return 1
    if ! ps -p "$scheduler_pid" > /dev/null 2>&1; then
      # The process is gone; drop the stale PID file.
      rm -f "$PID_FILE"
      return 1
    fi
    return 0
  }
  # Print the scheduler's memory usage in whole megabytes.
  # The value is the `rss` column reported by ps (treated as kilobytes) divided
  # by 1024. Prints "0" when there is no recorded PID, the process is not
  # running, or ps returns nothing.
  check_memory() {
    local scheduler_pid
    local rss_kb=""
    scheduler_pid=$(get_pid)
    if [ -n "$scheduler_pid" ] && ps -p "$scheduler_pid" > /dev/null 2>&1; then
      # "rss=" suppresses the column header; tr strips the padding spaces.
      rss_kb=$(ps -p "$scheduler_pid" -o rss= 2>/dev/null | tr -d ' ')
    fi
    if [ -z "$rss_kb" ]; then
      echo "0"
      return
    fi
    echo "$((rss_kb / 1024))"
  }
  # Increment the persisted restart counter and print the new value.
  # Globals:  RESTART_COUNT_FILE (read and rewritten)
  # Outputs:  the incremented count on stdout
  # A missing, empty or corrupt counter file is treated as a count of 0.
  record_restart() {
    local count=0
    if [ -f "$RESTART_COUNT_FILE" ]; then
      count=$(cat "$RESTART_COUNT_FILE")
    fi
    # Only plain decimal digits are accepted. Anything else would either raise
    # an arithmetic error or be evaluated as an expression inside $(( )).
    case "$count" in
      '' | *[!0-9]*) count=0 ;;
    esac
    # 10# forces base 10 so a value with leading zeros (e.g. "08") is not
    # parsed as an invalid octal number.
    count=$((10#$count + 1))
    echo "$count" > "$RESTART_COUNT_FILE"
    echo "$count"
  }
  # Reset the persisted restart counter by overwriting $RESTART_COUNT_FILE
  # with "0".
  reset_restart_count() {
    printf '%s\n' "0" > "$RESTART_COUNT_FILE"
  }
  # Start the scheduler in the background and record its PID.
  # Globals:  SCRIPT_NAME, LOG_DIR, PID_FILE, MAX_MEMORY_MB (read)
  # Outputs:  progress messages on stdout, errors on stderr
  # Returns:  0 when the process is still alive 3 seconds after launch;
  #           1 when a scheduler is already running, the PID file cannot be
  #           written, or the process exits right after starting.
  start_scheduler() {
    if is_running; then
      echo "调度器已经在运行中 (PID: $(get_pid))"
      return 1
    fi
    echo "正在启动多线程调度器..."
    echo "日志文件将保存在 $LOG_DIR/ 目录中"
    echo "最大内存使用量: ${MAX_MEMORY_MB}MB"
    # Python runtime settings for the child process:
    #   PYTHONMALLOC=malloc  - use the C library malloc() for all allocations
    #   PYTHONDEVMODE=1      - enable Python Development Mode (extra runtime checks)
    #   PYTHONUNBUFFERED=1   - unbuffered stdout/stderr (same effect as -u)
    export PYTHONMALLOC=malloc
    export PYTHONDEVMODE=1
    export PYTHONUNBUFFERED=1
    # Append instead of truncating: the output of a crashed run must survive the
    # next (possibly automatic) restart so it can still be inspected.
    local stdout_log="$LOG_DIR/scheduler_stdout.log"
    echo "$(date): 启动调度器" >> "$stdout_log"
    nohup python3 -u "$SCRIPT_NAME" >> "$stdout_log" 2>&1 &
    local pid=$!
    # Record the PID. Without the PID file the process cannot be managed and a
    # later "start" would launch a duplicate, so stop the process on failure.
    if ! echo "$pid" > "$PID_FILE"; then
      echo "错误: 无法写入PID文件 $PID_FILE" >&2
      kill -TERM "$pid" 2>/dev/null
      return 1
    fi
    echo "调度器已启动 (PID: $pid)"
    echo "使用以下命令查看状态:"
    echo " ./start_scheduler.sh status"
    echo " tail -f $LOG_DIR/scheduler_*.log"
    echo ""
    echo "使用以下命令停止:"
    echo " ./start_scheduler.sh stop"
    # Give the process a few seconds, then confirm it did not exit immediately.
    sleep 3
    if ! ps -p "$pid" > /dev/null 2>&1; then
      echo "警告: 进程启动后立即退出,请检查日志文件"
      rm -f "$PID_FILE"
      return 1
    fi
    return 0
  }
  # Stop the scheduler recorded in the PID file.
  # Sends SIGTERM, waits up to 10 seconds, then escalates to SIGKILL.
  # Globals:  PID_FILE (removed on success)
  # Returns:  0 once the scheduler is stopped; 1 when no PID is recorded or when
  #           the process could not be killed (the PID file is kept in that case).
  stop_scheduler() {
    local pid
    pid=$(get_pid)
    if [ -z "$pid" ]; then
      echo "调度器未运行"
      return 1
    fi
    echo "正在停止调度器 (PID: $pid)..."
    # Ask for a graceful shutdown first.
    kill -TERM "$pid" 2>/dev/null
    # Wait for the process to exit.
    local count=0
    while [ "$count" -lt 10 ] && ps -p "$pid" > /dev/null 2>&1; do
      sleep 1
      count=$((count + 1))
      echo "等待进程结束... ($count/10)"
    done
    # Still alive after the grace period: force-kill it.
    if ps -p "$pid" > /dev/null 2>&1; then
      echo "强制停止进程..."
      # A failed kill on a process that still exists (e.g. insufficient
      # permission) means the scheduler is NOT stopped. Keep the PID file so it
      # stays tracked and a later "start" does not launch a duplicate.
      if ! kill -KILL "$pid" 2>/dev/null && ps -p "$pid" > /dev/null 2>&1; then
        echo "错误: 无法停止进程 (PID: $pid)" >&2
        return 1
      fi
    fi
    # Remove the PID file.
    rm -f "$PID_FILE"
    echo "调度器已停止"
  }
  # Stop the monitor process recorded in the monitor PID file.
  # Sends SIGTERM, waits up to 5 seconds, then escalates to SIGKILL.
  # Globals:  MONITOR_PID_FILE (removed on success)
  # Returns:  0 when no monitor PID is recorded (nothing printed) or once the
  #           monitor is stopped; 1 when the process could not be killed (the
  #           PID file is kept in that case).
  stop_monitor() {
    local monitor_pid
    monitor_pid=$(get_monitor_pid)
    if [ -z "$monitor_pid" ]; then
      return 0
    fi
    echo "正在停止监控进程 (PID: $monitor_pid)..."
    kill -TERM "$monitor_pid" 2>/dev/null
    local count=0
    while [ "$count" -lt 5 ] && ps -p "$monitor_pid" > /dev/null 2>&1; do
      sleep 1
      count=$((count + 1))
    done
    if ps -p "$monitor_pid" > /dev/null 2>&1; then
      # A failed kill on a process that still exists means the monitor is still
      # running; report it instead of deleting the PID file and claiming success.
      if ! kill -KILL "$monitor_pid" 2>/dev/null && ps -p "$monitor_pid" > /dev/null 2>&1; then
        echo "错误: 无法停止监控进程 (PID: $monitor_pid)" >&2
        return 1
      fi
    fi
    rm -f "$MONITOR_PID_FILE"
    echo "监控进程已停止"
  }
  # Print a status report for the scheduler.
  # When the scheduler is running: PID, memory usage, ps process details, the
  # restart count (if the counter file exists), the last 5 lines of today's log
  # and, if its process is alive, the monitor PID.
  # When it is not running: a single "not running" line.
  show_status() {
    if ! is_running; then
      echo "调度器未运行"
      return
    fi

    local scheduler_pid
    local used_mb
    scheduler_pid=$(get_pid)
    used_mb=$(check_memory)
    echo "调度器正在运行 (PID: $scheduler_pid)"
    echo "内存使用: ${used_mb}MB / ${MAX_MEMORY_MB}MB"
    echo "进程信息:"
    ps -p "$scheduler_pid" -o pid,ppid,cmd,etime
    echo ""

    # Restart counter, shown only when the counter file exists.
    if [ -f "$RESTART_COUNT_FILE" ]; then
      local restarts
      restarts=$(cat "$RESTART_COUNT_FILE")
      echo "重启次数: $restarts / $MAX_RESTARTS"
    fi
    echo ""

    # Tail of today's dated log file.
    echo "最近的日志:"
    local today_log
    today_log="$LOG_DIR/scheduler_$(date +%Y%m%d).log"
    if [ -f "$today_log" ]; then
      tail -5 "$today_log"
    else
      echo "未找到今日日志文件"
    fi

    # Monitor process state, reported only when its process is alive.
    if [ -f "$MONITOR_PID_FILE" ]; then
      local watcher_pid
      watcher_pid=$(get_monitor_pid)
      if ps -p "$watcher_pid" > /dev/null 2>&1; then
        echo ""
        echo "监控进程正在运行 (PID: $watcher_pid)"
      fi
    fi
  }
  # Restart the scheduler: stop it, pause 2 seconds, then start it again.
  # The return status is that of start_scheduler.
  restart_scheduler() {
    printf '%s\n' "重启调度器..."
    stop_scheduler
    sleep 2
    start_scheduler
  }
  # Start the background monitor loop.
  # The loop restarts the scheduler when it is not running (up to MAX_RESTARTS
  # attempts) and restarts it when its memory usage exceeds MAX_MEMORY_MB.
  # Globals:  MONITOR_PID_FILE (written), LOG_DIR, MAX_RESTARTS, MAX_MEMORY_MB
  # Outputs:  status messages on stdout; the loop logs to $LOG_DIR/monitor.log
  # Returns:  1 when a monitor is already running, otherwise the status of the
  #           final echo.
  start_monitor() {
    if [ -f "$MONITOR_PID_FILE" ] && ps -p "$(get_monitor_pid)" > /dev/null 2>&1; then
      echo "监控进程已经在运行中"
      return 1
    fi
    echo "启动监控模式..."
    echo "监控进程将自动重启崩溃的调度器"
    # Start counting restarts from zero.
    reset_restart_count
    # The monitor loop runs in a background subshell.
    (
      local restart_count
      local memory_mb
      while true; do
        if ! is_running; then
          restart_count=$(record_restart)
          echo "$(date): 调度器已停止,尝试重启 (第${restart_count}次)"
          if [ "$restart_count" -le "$MAX_RESTARTS" ]; then
            start_scheduler
            if is_running; then
              echo "$(date): 调度器重启成功"
              sleep 30 # wait 30 seconds before the next check
            else
              echo "$(date): 调度器重启失败"
              sleep 60 # wait 1 minute before retrying
            fi
          else
            echo "$(date): 达到最大重启次数 ($MAX_RESTARTS),停止监控"
            # The monitor is about to exit: remove its own PID file so a later
            # stop/start does not act on a stale (possibly reused) PID. Only
            # remove it if it still names this subshell.
            if [ "$(get_monitor_pid)" = "$BASHPID" ]; then
              rm -f "$MONITOR_PID_FILE"
            fi
            break
          fi
        else
          # Scheduler is alive: enforce the memory ceiling.
          memory_mb=$(check_memory)
          if [ "$memory_mb" -gt "$MAX_MEMORY_MB" ]; then
            echo "$(date): 内存使用过高 (${memory_mb}MB > ${MAX_MEMORY_MB}MB),重启调度器"
            stop_scheduler
            sleep 5
            start_scheduler
          fi
          sleep 30 # check every 30 seconds
        fi
      done
    ) > "$LOG_DIR/monitor.log" 2>&1 &
    local monitor_pid=$!
    echo "$monitor_pid" > "$MONITOR_PID_FILE"
    echo "监控进程已启动 (PID: $monitor_pid)"
    echo "监控日志: tail -f $LOG_DIR/monitor.log"
  }
  # Stop monitor mode: stop the monitor process, then print a confirmation.
  stop_monitor_mode() {
    stop_monitor
    printf '%s\n' "监控模式已停止"
  }
  # Main dispatch: run the command given as the first argument ("start" when
  # no argument is supplied). Unknown commands print the usage and exit 1.
  case "${1:-start}" in
    start)
      start_scheduler
      ;;
    stop)
      # Stop the monitor first: if the scheduler were stopped while the monitor
      # is still alive, the monitor could restart it before being stopped itself.
      stop_monitor
      # As before, "stop" does not fail just because the scheduler was not running.
      stop_scheduler || true
      ;;
    status)
      show_status
      ;;
    restart)
      restart_scheduler
      ;;
    monitor)
      start_monitor
      ;;
    stop-monitor)
      stop_monitor_mode
      ;;
    cache-status)
      echo "查看缓存状态..."
      python3 cache_manager.py status
      ;;
    cache-clean)
      echo "清理缓存文件..."
      python3 cache_manager.py clean
      ;;
    cache-cleanup)
      echo "清理过期缓存文件..."
      python3 cache_manager.py cleanup
      ;;
    *)
      # The usage line lists every supported command, including the cache ones.
      echo "用法: $0 {start|stop|status|restart|monitor|stop-monitor|cache-status|cache-clean|cache-cleanup}"
      echo ""
      echo "命令说明:"
      echo " start - 启动调度器"
      echo " stop - 停止调度器和监控"
      echo " status - 查看运行状态"
      echo " restart - 重启调度器"
      echo " monitor - 启动监控模式(自动重启)"
      echo " stop-monitor - 停止监控模式"
      echo " cache-status - 查看缓存状态"
      echo " cache-clean - 清理所有缓存文件"
      echo " cache-cleanup- 清理过期缓存文件"
      echo ""
      echo "监控模式特性:"
      echo " - 自动检测进程崩溃并重启"
      echo " - 内存使用监控(超过${MAX_MEMORY_MB}MB自动重启)"
      echo " - 最大重启次数限制: $MAX_RESTARTS"
      echo ""
      echo "示例:"
      echo " $0 start # 启动"
      echo " $0 monitor # 启动监控模式"
      echo " $0 status # 查看状态"
      echo " $0 cache-status # 查看缓存状态"
      echo " $0 cache-cleanup # 清理过期缓存"
      echo " $0 stop # 停止所有"
      exit 1
      ;;
  esac