run_long_articles_job.sh

#!/usr/bin/env bash
# =============================================================
# Author: Junhui Luo / 2025-07-14
# Triggered by system cron once per minute; the script parses the
# task expressions below to decide which tasks to launch.
# =============================================================
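# A crontab entry like the following (installation line assumed, not part of this
# file; path taken from SCRIPT_DIR below) would provide the once-per-minute trigger:
#   * * * * * /bin/bash /root/luojunhui/LongArticlesJob/run_long_articles_job.sh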
###################### Global configuration ######################
SCRIPT_DIR="/root/luojunhui/LongArticlesJob"          # working directory
LOG_DIR="${SCRIPT_DIR}/logs"                          # log root directory
PYTHON_SCRIPT="long_articles_job.py"                  # unified entry script
CONDA_SH="/root/miniconda3/etc/profile.d/conda.sh"
CONDA_ENV="tasks"
LOG_RETENTION_DAYS=7                                  # days to keep log files
LOG_MAX_MB=100                                        # max size per log file in MB; larger files are truncated
# Failure alert (implement as needed: email, DingTalk, Prometheus Pushgateway, ...)
on_failure(){
    local task=$1
    local now=$(date '+%F %T')
    local url="https://open.feishu.cn/open-apis/bot/v2/hook/223b3d72-f2e8-40e0-9b53-6956e0ae7158"
    local content="Scheduled task failed: ${task}\nTime: ${now}"
    curl --request POST "${url}" \
        --header 'Content-Type: application/json' \
        --data-raw "{\"msg_type\":\"interactive\",\"card\":{\"content\":\"${content}\"}}" \
        >/dev/null 2>&1
}
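# on_failure usage (hypothetical call, for illustration):
#   on_failure "run_sph_video_crawler"
# Note: the JSON payload above is a minimal sketch of a Feishu bot message; the
# exact card schema expected for msg_type "interactive" may require more fields.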
###################### Task definitions ######################
# Syntax: "minute hour day-of-month month day-of-week|task_name|log template"
# Supports *, */n, a-b, a,b,c, and comma-separated combinations of these.
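# Example entry (hypothetical task name, for illustration only):
#   "*/10 9-18 * * 1-5|my_demo_task|${LOG_DIR}/my_demo_task/%Y-%m-%d.log"
#   -> every 10 minutes, between 09:00 and 18:59, Monday through Friday.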
TASKS=(
    # WeChat Channels (shipinhao) video crawler
    "0 3 * * *|run_sph_video_crawler|${LOG_DIR}/run_sph_video_crawler/%Y-%m-%d.log"
    "0 15 * * *|run_sph_video_crawler|${LOG_DIR}/run_sph_video_crawler/%Y-%m-%d.log"
    # Piaoquan on-site video crawler
    "0 6 * * *|run_piaoquan_video_crawler|${LOG_DIR}/run_piaoquan_video_crawler/%Y-%m-%d.log"
    # Sohu video crawler
    "10 6 * * *|run_sohu_video_crawler|${LOG_DIR}/run_sohu_video_crawler/%Y-%m-%d.log"
    # Generalize top articles as supply
    "28 19 * * *|top_article_generalize|${LOG_DIR}/top_article_generalize/%Y-%m-%d.log"
    # Check remaining Kimi quota
    "30 * * * *|run_check_kimi_balance|${LOG_DIR}/run_check_kimi_balance/%Y-%m-%d.log"
    # Service-account data collection
    "0 17 * * *|run_fwh_data_manager|${LOG_DIR}/run_fwh_data_manager/%Y-%m-%d.log"
    "0 11 * * *|run_fwh_data_manager|${LOG_DIR}/run_fwh_data_manager/%Y-%m-%d.log"
    # Title similarity task
    "*/10 * * * *|run_title_similarity_task|${LOG_DIR}/run_title_similarity_task/%Y-%m-%d.log"
    # Toutiao video crawler
    "0 4 * * *|run_toutiao_video_crawler|${LOG_DIR}/run_toutiao_video_crawler/%Y-%m-%d.log"
    "0 16 * * *|run_toutiao_video_crawler|${LOG_DIR}/run_toutiao_video_crawler/%Y-%m-%d.log"
    "0 20 * * *|run_toutiao_video_crawler|${LOG_DIR}/run_toutiao_video_crawler/%Y-%m-%d.log"
    # Baidu video crawler
    "0 0 * * *|run_baidu_video_crawler|${LOG_DIR}/run_baidu_video_crawler/%Y-%m-%d.log"
    "0 12 * * *|run_baidu_video_crawler|${LOG_DIR}/run_baidu_video_crawler/%Y-%m-%d.log"
    "40 19 * * *|run_baidu_video_crawler|${LOG_DIR}/run_baidu_video_crawler/%Y-%m-%d.log"
    # External service-account monitoring
    "0 10 * * *|run_outside_server_accounts_monitor|${LOG_DIR}/run_outside_server_accounts_monitor/%Y-%m-%d.log"
    "0 16 * * *|run_outside_server_accounts_monitor|${LOG_DIR}/run_outside_server_accounts_monitor/%Y-%m-%d.log"
    # Automatically take videos offline
    "30 9 * * *|run_get_off_videos|${LOG_DIR}/run_get_off_videos/%Y-%m-%d.log"
    "30 15 * * *|run_get_off_videos|${LOG_DIR}/run_get_off_videos/%Y-%m-%d.log"
    "38 19 * * *|run_get_off_videos|${LOG_DIR}/run_get_off_videos/%Y-%m-%d.log"
)
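# To add a task, append one more "min hour dom mon dow|name|log template" line above;
# the name must be a --task_name that long_articles_job.py knows how to dispatch.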
###################### Utility functions ######################
log(){ printf '%s [%s] %s\n' "$(date '+%F %T')" "$1" "$2"; }

cron_field_match(){ # args: field current_value
    local field=$1 now=$2
    [[ $field == "*" ]] && return 0
    IFS=',' read -ra parts <<< "$field"
    for p in "${parts[@]}"; do
        if [[ $p == "*/"* ]]; then              # step: */n
            local step=${p#*/}
            (( now % step == 0 )) && return 0
        elif [[ $p == *-* ]]; then              # range: a-b
            local start=${p%-*} end=${p#*-}
            (( now >= start && now <= end )) && return 0
        elif (( now == p )); then               # single value
            return 0
        fi
    done
    return 1
}
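# cron_field_match worked examples (field, current value) -> result:
#   cron_field_match "*"    7   -> match
#   cron_field_match "*/10" 30  -> match    (30 % 10 == 0)
#   cron_field_match "9-18" 20  -> no match
#   cron_field_match "0,30" 30  -> match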
cron_match(){ # arg: full cron expression
    read -r m h dom mon dow <<< "$1"
    local n_m=$(date +%-M) n_h=$(date +%-H) n_dom=$(date +%-d) \
          n_mon=$(date +%-m) n_dow=$(date +%-u)   # 1 (Monday) ... 7 (Sunday)
    cron_field_match "$m"   "$n_m"   && \
    cron_field_match "$h"   "$n_h"   && \
    cron_field_match "$dom" "$n_dom" && \
    cron_field_match "$mon" "$n_mon" && \
    cron_field_match "$dow" "$n_dow"
}
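# cron_match note: the weekday comes from `date +%u`, i.e. 1 = Monday ... 7 = Sunday,
# which differs from classic cron's 0-6 with Sunday = 0. All entries above use "*"
# for the weekday field, so the difference currently has no effect.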
start_task(){ # args: task_name log_file
    local name=$1 logfile=$2
    mkdir -p "$(dirname "$logfile")"; touch "$logfile"
    # Skip if the task is already running
    pgrep -f "python3 $PYTHON_SCRIPT --task_name $name" >/dev/null && {
        log INFO "Task $name is already running" | tee -a "$logfile"; return; }
    (
        # Run in a subshell so the log redirection and conda activation stay local
        exec >>"$logfile" 2>&1
        log INFO "Starting task $name"
        cd "$SCRIPT_DIR" || { log ERROR "Failed to enter $SCRIPT_DIR"; exit 1; }
        [[ -f $CONDA_SH ]] && { source "$CONDA_SH"; conda activate "$CONDA_ENV"; }
        nohup python3 "$PYTHON_SCRIPT" --task_name "$name" &
        sleep 1
        pgrep -f "python3 $PYTHON_SCRIPT --task_name $name" \
            && log SUCCESS "Task $name started successfully" \
            || { log ERROR "Task $name failed to start"; on_failure "$name"; }
    ) &
}
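# start_task design note: the task is launched with nohup inside a backgrounded
# subshell, so the Python process keeps running after this scheduler exits; the pgrep
# check one second after launch is only a coarse liveness probe, not a completion check.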
cleanup_logs(){
    # Delete logs past the retention window
    find "$LOG_DIR" -type f -name '*.log' -mtime +"$LOG_RETENTION_DAYS" -delete
    # Truncate oversized logs
    find "$LOG_DIR" -type f -name '*.log' -size +"${LOG_MAX_MB}M" -exec \
        sh -c 'cat /dev/null > "$1"' sh {} \;
}
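# cleanup_logs note: oversized logs are truncated in place (cat /dev/null > file)
# rather than deleted, so a still-running task that holds the file open in append
# mode keeps writing to the same path instead of an orphaned inode.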
###################### Main flow ######################
(
    # Global lock: prevent concurrent runs of this script
    exec 200>"$0.lock"
    flock -n 200 || exit 0

    mkdir -p "$LOG_DIR"
    MAIN_LOG="${LOG_DIR}/scheduler_$(date +%Y-%m-%d).log"
    exec >>"$MAIN_LOG" 2>&1

    log INFO "====== scheduler run started ======"
    for task in "${TASKS[@]}"; do
        IFS='|' read -r cron_expr name log_tpl <<< "$task"
        cron_match "$cron_expr" || continue
        logfile=$(date +"$log_tpl")   # render %Y-%m-%d in the log template
        start_task "$name" "$logfile"
    done
    cleanup_logs
    log INFO "====== scheduler run finished ======"
)
exit 0