# 爬虫调度系统 ### 启动 1. cd ./piaoquan_crawler 2. sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} >>${nohup_dir} 2>&1 & ```commandline 参数说明 ${crawler_dir}: 爬虫执行路径,如: scheduling/scheduling_main/run_write_task.py ${log_type}: 日志命名格式,如: scheduling-task,则在 scheduling/logs/目录下,生成 2023-02-08-scheduling-task.log ${crawler}: 哪款爬虫,如: youtube / kanyikan / weixinzhishu ${env}: 爬虫运行环境,正式环境: prod / 测试环境: dev ${nohup_dir}: nohup日志存储路径,如: shceduling/nohup-task.log ``` #### 运行命令 ```commandline 阿里云 102 服务器 sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" nohup-write.log sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" nohup-task.log # 读取任务写入 Redis,1分钟/次 */1 * * * * cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-write.log # 调度任务,5秒/次 * * * * * for i in {1..12}; do cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-task.log; sleep 5; done 香港服务器 sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" shceduling/nohup-write.log sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" shceduling/nohup-task.log 线下调试 # 读取任务写入 Redis sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="dev" scheduling/logs/scheduling-write.log # 调度任务 sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="dev" scheduling/logs/scheduling-task.log 杀进程 ps aux | grep scheduling ps aux | grep scheduling | grep -v grep | awk '{print $2}' | xargs kill -9 ``` # 爬虫平台 ### 启动 1. cd ./piaoquan_crawler 2. sh ./main/main.sh ${crawler_dir} ${log_type} ${crawler} ${strategy} ${oss_endpoint} ${env} ${machine} ${nohup_dir} ```commandline 参数说明 ${crawler_dir}: 爬虫执行路径,如: ./youtube/youtube_main/run_youtube_follow.py ${log_type}: 日志命名格式,如: follow,则在 youtube/logs/目录下,生成 2023-02-08-follow.log ${crawler}: 哪款爬虫,如: youtube / kanyikan / weixinzhishu ${strategy}: 爬虫策略,如: 定向爬虫策略 / 小时榜爬虫策略 / 热榜爬虫策略 # ${oss_endpoint}: OSS网关,内网: inner / 外网: out / 香港: hk ${env}: 爬虫运行环境,正式环境: prod / 测试环境: dev ${machine}: 爬虫运行机器,阿里云服务器: aliyun_hk / aliyun / macpro / macair / local ${nohup_dir}: nohup日志存储路径,如: ./youtube/nohup.log ``` #### YouTube ```commandline sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log # sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --env="prod" --machine="aliyun_hk" youtube/nohup.log youtube杀进程命令: ps aux | grep run_youtube ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9 ``` #### 微信指数 ```commandline 获取站外标题, crontab定时脚本, 每天 12:00:00 点运行一次 00 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_hot_search.py >>weixinzhishu/logs/nohup-hot-search.log 2>&1 & 获取站外热词微信指数, crontab定时脚本, 每天 12:30:00 点运行一次 30 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py >>weixinzhishu/logs/today-score.log 2>&1 & 获取微信指数, crontab定时脚本, 每天 08:00:00 20:00:00 各运行一次 00 08,20 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_score.py >>weixinzhishu/logs/nohup-score.log 2>&1 & nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_long.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_long.log 2>&1 & nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_out.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_out.log 2>&1 & nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_sort.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_sort.log 2>&1 & 获取 wechat_key 设备: Mac Air cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 & 线下调试 抓取今日微信指数 python3 /Users/wangkun/Desktop/crawler/piaoquan_crawler/weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py 检测进程 ps aux | grep WeChat.app ps aux | grep weixinzhishu ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep 微信 | grep -v grep | awk '{print $2}' | xargs kill -9 ``` #### 线下爬虫: 刚刚都传 / 吉祥幸福 / 知青天天看 / 众妙音信 / wechat_search_key ```commandline MacAir 设备, crontab定时任务 * * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_offline.sh "prod" 线下调试 sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_offline.sh "dev" cd /Users/piaoquan/Desktop/piaoquan_crawler/ && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 & 检测进程 ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9 ``` #### 视频号 ```commandline 正式环境 00 00 * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="prod" 线下调试 sh shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="dev" 检测进程 ps aux | grep shipinhao_search ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9 ``` #### 调用MQ的爬虫进程守护: main/process_mq.sh ```commandline 本地调试 /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "dev" /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "dev" 102 服务器 # 调用 MQ 爬虫守护进程 * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh3" "gongzhonghao" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh4" "gongzhonghao" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh5" "gongzhonghao" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "prod" * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "prod" * * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "prod" 线下服务器 杀进程 ps aux | grep suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep Appium | grep -v grep | awk '{print $2}' | xargs kill -9 ``` #### 生成 requirements.txt ```commandline cd ./piaoquan_crawler && pipreqs ./ --force # pip3 install Appium-Python-Client Appium_Python_Client==2.10.1 # 翻墙, pip3 install git+https://github.com/pyatom/pyatom/ atomac==1.2.0 # pip3 install ffmpeg-python ffmpeg==1.4 # pip3 install loguru loguru==0.6.0 # pip3 install lxml lxml==4.9.1 # pip3 install mq_http_sdk, 若您使用的SDK版本为v1.0.0,您需要安装大于等于2.5且小于3.0版本的Python。若您使用的SDK版本大于v1.0.0,您需要安装2.5及以上版本的Python。 mq_http_sdk==1.0.3 # sudo pip3 install oss2 oss2==2.15.0 # pip3 install psutil psutil==5.9.2 # pip3 install PyExecJS PyExecJS==1.5.1 # pip3 install PyMysql PyMySQL==1.0.2 # pip3 install redis redis==4.5.1 # pip3 install requests requests==2.27.1 # pip3 install selenium selenium==4.9.1 # pip3 install urllib3 urllib3==1.26.9 # pip3 install jieba jieba==0.42.1 # pip3 install workalendar workalendar==17.0.0 # pip3 install aliyun_python_sdk # pip3 install -U aliyun-log-python-sdk aliyun_python_sdk==2.2.0 ```