票圈爬虫平台

wangkun 743cb2e073 update 1 рік тому
benshanzhufu 1a603d121c update 1 рік тому
common fee4528e47 update 1 рік тому
douyin 743cb2e073 update 1 рік тому
ganggangdouchuan 1a603d121c update 1 рік тому
gongzhonghao 465aba1858 update 1 рік тому
jixiangxingfu 1a603d121c update 1 рік тому
kuaishou fee4528e47 update 1 рік тому
main 743cb2e073 update 1 рік тому
scheduling 5ce152779f update 1 рік тому
shipinhao 7d4f8aa937 update 1 рік тому
suisuiniannianyingfuqi 1a603d121c update 1 рік тому
weixinzhishu 1a603d121c update 1 рік тому
xiaoniangao 743cb2e073 update 1 рік тому
xigua 743cb2e073 update 1 рік тому
youtube 1a603d121c update 1 рік тому
zhiqingtiantiankan 1a603d121c update 1 рік тому
zhongmiaoyinxin 38514bd93d update 1 рік тому
.gitignore 565226dbe1 add xigua_search 1 рік тому
README.MD 0347cc0439 add kuaishou_author 1 рік тому
requirements.txt 565226dbe1 add xigua_search 1 рік тому

README.MD

爬虫调度系统

启动

  1. cd ./piaoquan_crawler
  2. sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} >>${nohup_dir} 2>&1 &

    参数说明
    ${crawler_dir}:     爬虫执行路径,如: scheduling/scheduling_main/run_write_task.py
    ${log_type}:        日志命名格式,如: scheduling-task,则在 scheduling/logs/目录下,生成 2023-02-08-scheduling-task.log
    ${crawler}:         哪款爬虫,如: youtube / kanyikan / weixinzhishu
    ${env}:             爬虫运行环境,正式环境: prod / 测试环境: dev
    ${nohup_dir}:       nohup日志存储路径,如: scheduling/nohup-task.log
    

    运行命令

    阿里云 102 服务器
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" nohup-write.log 
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" nohup-task.log 
    # 读取任务写入 Redis,1分钟/次
    */1 * * * * cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-write.log
    # 调度任务,5秒/次
    * * * * * for i in {1..12}; do cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-task.log; sleep 5; done
    香港服务器
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/nohup-write.log 
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/nohup-task.log 
    
    线下调试
    # 读取任务写入 Redis
    sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="dev"  scheduling/logs/scheduling-write.log 
    # 调度任务
    sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="dev"  scheduling/logs/scheduling-task.log 
    
    杀进程
    ps aux | grep scheduling
    ps aux | grep scheduling | grep -v grep | awk '{print $2}' | xargs kill -9
    

爬虫平台

启动

  1. cd ./piaoquan_crawler
  2. sh ./main/main.sh ${crawler_dir} ${log_type} ${crawler} ${strategy} ${oss_endpoint} ${env} ${machine} ${nohup_dir}

    参数说明
    ${crawler_dir}:     爬虫执行路径,如: ./youtube/youtube_main/run_youtube_follow.py
    ${log_type}:        日志命名格式,如: follow,则在 youtube/logs/目录下,生成 2023-02-08-follow.log
    ${crawler}:         哪款爬虫,如: youtube / kanyikan / weixinzhishu
    ${strategy}:        爬虫策略,如: 定向爬虫策略 / 小时榜爬虫策略 / 热榜爬虫策略
    # ${oss_endpoint}:    OSS网关,内网: inner / 外网: out / 香港: hk
    ${env}:             爬虫运行环境,正式环境: prod / 测试环境: dev
    ${machine}:         爬虫运行机器,阿里云服务器: aliyun_hk / aliyun / macpro / macair / local
    ${nohup_dir}:       nohup日志存储路径,如: ./youtube/nohup.log
    

    YouTube

    sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
    # sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --env="prod" --machine="aliyun_hk" youtube/nohup.log
    youtube杀进程命令: 
    ps aux | grep run_youtube
    ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
    

西瓜视频

阿里云 102 服务器
西瓜定向: sh ./main/main.sh ./xigua/xigua_main/run_xigua_follow.py --log_type="follow" --crawler="xigua" --strategy="定向爬虫策略" --oss_endpoint="inner" --env="prod" --machine="aliyun" xigua/nohup.log
西瓜推荐: /usr/bin/sh ./main/scheduling_main.sh ./xigua/xigua_main/run_xigua_recommend.py --log_type="recommend" --crawler="xigua" --env="prod" xigua/logs/nohup-recommend.log
本机
西瓜定向: sh ./main/main.sh ./xigua/xigua_main/run_xigua_follow.py --log_type="follow" --crawler="xigua" --strategy="定向爬虫策略" --oss_endpoint="out" --env="prod" --machine="local" xigua/nohup.log
西瓜推荐: sh ./main/scheduling_main.sh ./xigua/xigua_main/run_xigua_recommend.py --log_type="recommend" --crawler="xigua" --env="dev" xigua/logs/nohup-recommend.log
西瓜搜索: sh main/scheduling_main.sh ./xigua/xigua_main/run_xigua_search_new.py --log_type="search" --crawler="xigua" --env="dev" xigua/logs/search-shell.log
杀进程命令:
ps aux | grep run_xigua
ps aux | grep run_xigua | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_xigua_follow | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_xigua_recommend | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_xigua_search | grep -v grep | awk '{print $2}' | xargs kill -9

快手

阿里云 102 服务器
sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_recommend.py --log_type="recommend" --crawler="kuaishou" --strategy="推荐爬虫策略" --oss_endpoint="inner" --env="prod" --machine="aliyun" kuaishou/recommend.log
sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --oss_endpoint="inner" --env="prod" --machine="aliyun" kuaishou/follow.log
# sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --env="prod" --machine="aliyun" kuaishou/nohup.log
本机
sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --oss_endpoint="out" --env="dev" --machine="local" kuaishou/nohup.log
# sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --env="dev" --machine="local" kuaishou/nohup.log
macpro
sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --oss_endpoint="out" --env="prod" --machine="macpro" kuaishou/nohup.log
# sh ./main/main.sh ./kuaishou/kuaishou_main/run_kuaishou_follow.py --log_type="follow" --crawler="kuaishou" --strategy="定向爬虫策略" --env="prod" --machine="macpro" kuaishou/nohup.log
杀进程命令:
ps aux | grep run_kuaishou
ps aux | grep run_kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9

小年糕

阿里云 102 服务器
定向爬虫策略: /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="prod"  xiaoniangao/nohup-follow.log
小时榜爬虫策略: /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="prod" xiaoniangao/nohup-hour.log
播放量榜爬虫策略: /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="prod" xiaoniangao/nohup-play.log

线下调试
定向爬虫策略: sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-follow.log
小时榜爬虫策略: sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-hour.log
播放量榜爬虫策略: sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-play.log

nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_1.py >> xiaoniangao/nohup-1.log 2>&1 &
nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_2.py >> xiaoniangao/nohup-1.log 2>&1 &
nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_3.py >> xiaoniangao/nohup-1.log 2>&1 &

杀进程命令
ps aux | grep run_xiaoniangao_follow
ps aux | grep run_xiaoniangao_hour
ps aux | grep run_xiaoniangao_play
ps aux | grep run_xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9 
ps aux | grep run_xiaoniangao_follow | grep -v grep | awk '{print $2}' | xargs kill -9 
ps aux | grep run_xiaoniangao_hour | grep -v grep | awk '{print $2}' | xargs kill -9 
ps aux | grep run_xiaoniangao_play | grep -v grep | awk '{print $2}' | xargs kill -9 

公众号

阿里云 102 服务器
定向爬虫策略: 
/usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow.py --log_type="follow" --crawler="gongzhonghao" --env="prod"  gongzhonghao/nohup-follow.log
/usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow_2.py --log_type="follow-2" --crawler="gongzhonghao" --env="prod"  gongzhonghao/nohup-follow-2.log
/usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow_3.py --log_type="follow-3" --crawler="gongzhonghao" --env="prod"  gongzhonghao/nohup-follow-3.log
线下调试
定向爬虫策略: 
sh main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow.py --log_type="follow" --crawler="gongzhonghao" --env="dev" gongzhonghao/nohup-follow.log
sh main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow_2.py --log_type="follow-2" --crawler="gongzhonghao" --env="dev" gongzhonghao/nohup-follow-2.log
sh main/scheduling_main.sh ./gongzhonghao/gongzhonghao_main/run_gongzhonghao_follow_3.py --log_type="follow-3" --crawler="gongzhonghao" --env="dev" gongzhonghao/nohup-follow-3.log
杀进程命令
ps aux | grep run_gongzhonghao
ps aux | grep run_gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9 

微信指数

获取站外标题, crontab定时脚本, 每天 12:00:00 点运行一次
00 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_hot_search.py >>weixinzhishu/logs/nohup-hot-search.log 2>&1 &
获取站外热词微信指数, crontab定时脚本, 每天 12:30:00 点运行一次
30 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py >>weixinzhishu/logs/today-score.log 2>&1 &
获取微信指数, crontab定时脚本, 每天 08:00:00 20:00:00 各运行一次
00 08,20 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_score.py >>weixinzhishu/logs/nohup-score.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_long.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_long.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_out.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_out.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_sort.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_sort.log 2>&1 &
获取 wechat_key 设备: Mac Air 
cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
线下调试
抓取今日微信指数
python3 /Users/wangkun/Desktop/crawler/piaoquan_crawler/weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py
检测进程
ps aux | grep WeChat.app
ps aux | grep weixinzhishu
ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep 微信 | grep -v grep | awk '{print $2}' | xargs kill -9

抖音

阿里云 102 服务器
sh ./main/main.sh ./douyin/douyin_main/run_douyin_recommend.py --log_type="recommend" --crawler="douyin" --strategy="推荐爬虫策略" --oss_endpoint="inner" --env="prod" --machine="aliyun" douyin/recommend.log
# sh ./main/main.sh ./douyin/douyin_main/run_douyin_recommend.py --log_type="recommend" --crawler="douyin" --strategy="定向爬虫策略" --env="prod" --machine="aliyun" douyin/nohup.log
本机

#### 爬虫进程监测
```commandline
阿里云 102 服务器:/usr/bin/sh /data5/piaoquan_crawler/main/process.sh "prod"
香港 服务器:/usr/bin/sh /root/piaoquan_crawler/main/process.sh "hk"
线下调试:sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process.sh "dev"
```

本山祝福小程序

阿里云 102 服务器
/usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./benshanzhufu/benshanzhufu_main/run_benshanzhufu_recommend.py --log_type="recommend" --crawler="benshanzhufu" --env="prod"  ./benshanzhufu/logs/nohup-recommend.log
线下调试
sh ./main/scheduling_main.sh ./benshanzhufu/benshanzhufu_main/run_benshanzhufu_recommend.py --log_type="recommend" --crawler="benshanzhufu" --env="dev"  ./benshanzhufu/logs/nohup-recommend.log
检测进程
ps aux | grep run_benshanzhufu
ps aux | grep run_benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9

岁岁年年迎福气小程序

阿里云 102 服务器
/usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_suisuiniannianyingfuqi_recommend.py --log_type="recommend" --crawler="suisuiniannianyingfuqi" --env="prod"  ./suisuiniannianyingfuqi/logs/nohup-recommend.log
线下调试
sh ./main/scheduling_main.sh ./suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_suisuiniannianyingfuqi_recommend.py --log_type="recommend" --crawler="suisuiniannianyingfuqi" --env="dev"  ./suisuiniannianyingfuqi/logs/nohup-recommend.log
检测进程
ps aux | grep run_suisuiniannianyingfuqi
ps aux | grep run_suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9

线下爬虫: 刚刚都传 / 吉祥幸福 / 知青天天看 / 众妙音信 / wechat_search_key

MacAir 设备, crontab定时任务
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_offline.sh "prod"
线下调试
sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_offline.sh "dev"
cd /Users/piaoquan/Desktop/piaoquan_crawler/ && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
杀进程命令
ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9

视频号

正式环境
00 00 * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="prod"
线下调试
sh shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="dev"
检测进程
ps aux | grep shipinhao_search
ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9

爬虫进程监控: main/process.sh

102 服务器: 
* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process.sh "prod"  >>/data5/piaoquan_crawler/main/main_logs/run-process.log 2>&1
线下调试: 
sh main/process.sh "dev" >> main/main_logs/run-process.log 2>&1
杀进程命令
ps aux | grep search_key_mac | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_xigua_search | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9