| | 1 rok temu | |
|---|---|---|
benshanzhufu | 1 rok temu | |
common | 1 rok temu | |
control | 1 rok temu | |
dev | 1 rok temu | |
douyin | 1 rok temu | |
ganggangdouchuan | 1 rok temu | |
gongzhonghao | 1 rok temu | |
haitunzhufu | 1 rok temu | |
jixiangxingfu | 1 rok temu | |
kanyikan | 1 rok temu | |
kuaishou | 1 rok temu | |
main | 1 rok temu | |
monitor | 1 rok temu | |
scheduling | 1 rok temu | |
shipinhao | 1 rok temu | |
suisuiniannianyingfuqi | 1 rok temu | |
weixinzhishu | 1 rok temu | |
xiaoniangao | 1 rok temu | |
xigua | 1 rok temu | |
youtube | 1 rok temu | |
zhiqingtiantiankan | 1 rok temu | |
zhongmiaoyinxin | 1 rok temu | |
zhufuquanzi | 1 rok temu | |
zhufushenghuo | 1 rok temu | |
.gitignore | 1 rok temu | |
README.MD | 1 rok temu | |
requirements.txt | 1 rok temu |
sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} >>${nohup_dir} 2>&1 &
参数说明
${crawler_dir}: 爬虫执行路径,如: scheduling/scheduling_main/run_write_task.py
${log_type}: 日志命名格式,如: scheduling-task,则在 scheduling/logs/目录下,生成 2023-02-08-scheduling-task.log
${crawler}: 哪款爬虫,如: youtube / kanyikan / weixinzhishu
${env}: 爬虫运行环境,正式环境: prod / 测试环境: dev
${nohup_dir}: nohup日志存储路径,如: scheduling/nohup-task.log
阿里云 102 服务器
sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" nohup-write.log
sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" nohup-task.log
# 读取任务写入 Redis,1分钟/次
*/1 * * * * cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-write.log
# 调度任务,5秒/次
* * * * * for i in {1..12}; do cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-task.log; sleep 5; done
香港服务器
sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/nohup-write.log
sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/nohup-task.log
线下调试
# 读取任务写入 Redis
sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="dev" scheduling/logs/scheduling-write.log
# 调度任务
sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="dev" scheduling/logs/scheduling-task.log
杀进程
ps aux | grep scheduling
ps aux | grep scheduling | grep -v grep | awk '{print $2}' | xargs kill -9
sh ./main/main.sh ${crawler_dir} ${log_type} ${crawler} ${strategy} ${oss_endpoint} ${env} ${machine} ${nohup_dir}
参数说明
${crawler_dir}: 爬虫执行路径,如: ./youtube/youtube_main/run_youtube_follow.py
${log_type}: 日志命名格式,如: follow,则在 youtube/logs/目录下,生成 2023-02-08-follow.log
${crawler}: 哪款爬虫,如: youtube / kanyikan / weixinzhishu
${strategy}: 爬虫策略,如: 定向爬虫策略 / 小时榜爬虫策略 / 热榜爬虫策略
# ${oss_endpoint}: OSS网关,内网: inner / 外网: out / 香港: hk
${env}: 爬虫运行环境,正式环境: prod / 测试环境: dev
${machine}: 爬虫运行机器,阿里云服务器: aliyun_hk / aliyun / macpro / macair / local
${nohup_dir}: nohup日志存储路径,如: ./youtube/nohup.log
sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
# sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --env="prod" --machine="aliyun_hk" youtube/nohup.log
youtube杀进程命令:
ps aux | grep run_youtube
ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
# 微信指数,Mac Air
00 11 * * * cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
获取站外标题, crontab定时脚本, 每天 12:00 运行一次
00 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_hot_search.py >>weixinzhishu/logs/nohup-hot-search.log 2>&1 &
获取站外热词微信指数, crontab定时脚本, 每天 12:30 运行一次
30 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py >>weixinzhishu/logs/today-score.log 2>&1 &
获取微信指数, crontab定时脚本, 每天 08:00 和 20:00 各运行一次
00 08,20 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_score.py >>weixinzhishu/logs/nohup-score.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_long.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_long.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_out.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_out.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_sort.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_sort.log 2>&1 &
获取 wechat_key 设备: Mac Air
cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
线下调试
抓取今日微信指数
python3 /Users/wangkun/Desktop/crawler/piaoquan_crawler/weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py
检测进程
ps aux | grep WeChat.app
ps aux | grep weixinzhishu
ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep 微信 | grep -v grep | awk '{print $2}' | xargs kill -9
# 线下爬虫调度,每分钟检测线下爬虫进程状态
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_offline.sh "prod"
# 启动并检测Appium进程状态
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/start_appium.sh "recommend" "jixiangxingfu" "prod"
线下调试
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9 && nohup /opt/homebrew/bin/node /Applications/Appium.app/Contents/Resources/app/node_modules/appium/build/lib/main.js >>/Users/wangkun/Desktop/logs/nohup.log 2>&1 &
sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_offline.sh "dev"
cd /Users/piaoquan/Desktop/piaoquan_crawler/ && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
检测进程
ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9
正式环境
00 00 * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="prod"
线下调试
sh shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="dev"
检测进程
ps aux | grep shipinhao_search
ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
正式环境
* * * * * /usr/bin/sh /root/piaoquan_crawler/monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "prod"
线下调试
sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "dev"
检测进程
ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
本地调试
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "dev"
207 服务器, 调用 MQ 爬虫守护进程
# 岁岁年年迎福气
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "prod"
# 公众号(根据抓取目标用户数,自动计算需要启动 X 个进程同时抓取。每 100 个目标抓取用户,占用一个进程)
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh" "gongzhonghao" "author" "prod"
# 西瓜账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "prod"
# 西瓜搜索
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "prod"
# 本山祝福
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "prod"
# 快手推荐
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "prod"
# 快手账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "prod"
# 抖音推荐
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "prod"
# 抖音账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "prod"
# 小年糕播放榜
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "prod"
# 小年糕上升榜
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "prod"
# 小年糕账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "prod"
# 看一看推荐 1
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "prod"
# 看一看推荐健康类
* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "prod"
# 西瓜推荐 1
* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "prod"
# 西瓜推荐民生类
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "xgms" "xigua" "recommend" "prod"
# 启动 Appium
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/start_appium.sh "recommend" "shipinhao" "prod"
# 视频号推荐
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "sph" "shipinhao" "recommend" "prod"
# 视频号搜索
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "sph" "shipinhao" "search" "prod"
# 祝福圈子
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "zfqz" "zhufuquanzi" "recommend" "prod"
杀进程
ps aux | grep suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9
cd ./piaoquan_crawler && pipreqs ./ --force
# pip3 install Appium-Python-Client
Appium_Python_Client==2.10.1
# 翻墙, pip3 install git+https://github.com/pyatom/pyatom/
atomac==1.2.0
# pip3 install ffmpeg (注意: 与 ffmpeg-python 是不同的包)
ffmpeg==1.4
# pip3 install loguru
loguru==0.6.0
# pip3 install lxml
lxml==4.9.1
# pip3 install mq_http_sdk, 若您使用的SDK版本为v1.0.0,您需要安装大于等于2.5且小于3.0版本的Python。若您使用的SDK版本大于v1.0.0,您需要安装2.5及以上版本的Python。
mq_http_sdk==1.0.3
# sudo pip3 install oss2
oss2==2.15.0
# pip3 install psutil
psutil==5.9.2
# pip3 install PyExecJS
PyExecJS==1.5.1
# pip3 install PyMysql
PyMySQL==1.0.2
# pip3 install redis
redis==4.5.1
# pip3 install requests
requests==2.27.1
# pip3 install selenium==4.2.0
selenium==4.9.1
# pip3 install urllib3
urllib3==1.26.9
# pip3 install jieba
jieba==0.42.1
# pip3 install workalendar
workalendar==17.0.0
# pip3 install aliyun_python_sdk
# pip3 install -U aliyun-log-python-sdk
aliyun_python_sdk==2.2.0
# pip3 install opencv-python / pip3 install opencv-contrib-python
opencv-python~=4.8.0.74
# pip3 install scikit-learn
scikit-learn~=1.3.0
# pip3 install beautifulsoup4