票圈爬虫平台

wangkun c76c241d45 update hai 1 ano
benshanzhufu e536d00d07 update hai 1 ano
common c76c241d45 update hai 1 ano
control d8bcc1c3d1 update hai 1 ano
dev d73743d5a6 update haitunzhufu hai 1 ano
douyin e536d00d07 update hai 1 ano
ganggangdouchuan 38bde8b2cb update hai 1 ano
gongzhonghao 9b4558573e update hai 1 ano
haitunzhufu c76c241d45 update hai 1 ano
jixiangxingfu 38bde8b2cb update hai 1 ano
kanyikan 9e0fcd20c9 update hai 1 ano
kuaishou e536d00d07 update hai 1 ano
main 38bde8b2cb update hai 1 ano
monitor a224b49898 update hai 1 ano
scheduling d2920f2989 update hai 1 ano
shipinhao 65c065f75f update hai 1 ano
suisuiniannianyingfuqi 1c4b2731f8 update hai 1 ano
weixinzhishu 766295af54 add kuaishou_cut_title hai 1 ano
xiaoniangao e536d00d07 update hai 1 ano
xigua 652e329bb0 update hai 1 ano
youtube 1a603d121c update hai 1 ano
zhiqingtiantiankan 38bde8b2cb update hai 1 ano
zhongmiaoyinxin 38bde8b2cb update hai 1 ano
zhufuquanzi d73743d5a6 update haitunzhufu hai 1 ano
.gitignore 4954c999a0 update hai 1 ano
README.MD c76c241d45 update hai 1 ano
requirements.txt 2f02aac746 update hai 1 ano

README.MD

爬虫调度系统

启动

  1. cd ./piaoquan_crawler
  2. sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} >>${nohup_dir} 2>&1 &

    参数说明
    ${crawler_dir}:     爬虫执行路径,如: scheduling/scheduling_main/run_write_task.py
    ${log_type}:        日志命名格式,如: scheduling-task,则在 scheduling/logs/目录下,生成 2023-02-08-scheduling-task.log
    ${crawler}:         哪款爬虫,如: youtube / kanyikan / weixinzhishu
    ${env}:             爬虫运行环境,正式环境: prod / 测试环境: dev
    ${nohup_dir}:       nohup日志存储路径,如: scheduling/nohup-task.log
    

    运行命令

    阿里云 102 服务器
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" nohup-write.log 
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" nohup-task.log 
    # 读取任务写入 Redis,1分钟/次
    */1 * * * * cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-write.log
    # 调度任务,5秒/次
    * * * * * for i in {1..12}; do cd /data5/piaoquan_crawler && /usr/bin/sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/logs/scheduling-task.log; sleep 5; done
    香港服务器
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/nohup-write.log 
    sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/nohup-task.log 
    
    线下调试
    # 读取任务写入 Redis
    sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_write_task_v3.py --log_type="scheduling-write" --crawler="scheduling" --env="dev"  scheduling/logs/scheduling-write.log 
    # 调度任务
    sh ./main/scheduling_main.sh scheduling/scheduling_v3/run_scheduling_task_v3.py --log_type="scheduling-task" --crawler="scheduling" --env="dev"  scheduling/logs/scheduling-task.log 
    
    杀进程
    ps aux | grep scheduling
    ps aux | grep scheduling | grep -v grep | awk '{print $2}' | xargs kill -9
    

爬虫平台

启动

  1. cd ./piaoquan_crawler
  2. sh ./main/main.sh ${crawler_dir} ${log_type} ${crawler} ${strategy} ${oss_endpoint} ${env} ${machine} ${nohup_dir}

    参数说明
    ${crawler_dir}:     爬虫执行路径,如: ./youtube/youtube_main/run_youtube_follow.py
    ${log_type}:        日志命名格式,如: follow,则在 youtube/logs/目录下,生成 2023-02-08-follow.log
    ${crawler}:         哪款爬虫,如: youtube / kanyikan / weixinzhishu
    ${strategy}:        爬虫策略,如: 定向爬虫策略 / 小时榜爬虫策略 / 热榜爬虫策略
    # ${oss_endpoint}:    OSS网关,内网: inner / 外网: out / 香港: hk
    ${env}:             爬虫运行环境,正式环境: prod / 测试环境: dev
    ${machine}:         爬虫运行机器,阿里云服务器: aliyun_hk / aliyun / macpro / macair / local
    ${nohup_dir}:       nohup日志存储路径,如: ./youtube/nohup.log
    

    YouTube

    sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
    # sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --env="prod" --machine="aliyun_hk" youtube/nohup.log
    youtube杀进程命令: 
    ps aux | grep run_youtube
    ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
    

微信指数

# 微信指数,Mac Air
00 11 * * * cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &

获取站外标题, crontab定时脚本, 每天 12:00:00 运行一次
00 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_hot_search.py >>weixinzhishu/logs/nohup-hot-search.log 2>&1 &
获取站外热词微信指数, crontab定时脚本, 每天 12:30:00 运行一次
30 12 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py >>weixinzhishu/logs/today-score.log 2>&1 &
获取微信指数, crontab定时脚本, 每天 08:00:00 和 20:00:00 各运行一次
00 08,20 * * * cd /data5/piaoquan_crawler/ && /root/anaconda3/bin/python weixinzhishu/weixinzhishu_main/run_weixinzhishu_score.py >>weixinzhishu/logs/nohup-score.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_long.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_long.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_out.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_out.log 2>&1 &
nohup python3 -u /data5/piaoquan_crawler/weixinzhishu/weixinzhishu_main/weixinzhishu_inner_sort.py >>/data5/piaoquan_crawler/weixinzhishu/logs/nohup_inner_sort.log 2>&1 &
获取 wechat_key 设备: Mac Air 
cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
线下调试
抓取今日微信指数
python3 /Users/wangkun/Desktop/crawler/piaoquan_crawler/weixinzhishu/weixinzhishu_main/run_weixinzhishu_today_score.py
检测进程
ps aux | grep WeChat.app
ps aux | grep weixinzhishu
ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep 微信 | grep -v grep | awk '{print $2}' | xargs kill -9

线下爬虫: 刚刚都传 / 吉祥幸福 / 知青天天看 / 众妙音信 / wechat_search_key / start_appium / 祝福圈子

# 线下爬虫调度,每分钟检测线下爬虫进程状态
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_offline.sh "prod"
# 启动并检测Appium进程状态
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/start_appium.sh "recommend" "jixiangxingfu" "prod"

线下调试
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9 && nohup /opt/homebrew/bin/node /Applications/Appium.app/Contents/Resources/app/node_modules/appium/build/lib/main.js >>/Users/wangkun/Desktop/logs/nohup.log 2>&1 &
sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_offline.sh "dev"
cd /Users/piaoquan/Desktop/piaoquan_crawler/ && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/logs/nohup-search-key.log 2>&1 &
检测进程
ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9

视频号

正式环境
00 00 * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="prod"
线下调试
sh shipinhao/shipinhao_main/run_shipinhao.sh shipinhao/shipinhao_main/run_shipinhao_search.py --log_type="search" --crawler="shipinhao" --env="dev"
检测进程
ps aux | grep shipinhao_search
ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9

207 服务器,CPU/MEMORY 监控

正式环境
* * * * * /usr/bin/sh /root/piaoquan_crawler/monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "prod"
线下调试
sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "dev"
检测进程
ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9

调用MQ的爬虫进程守护: main/process_mq.sh

本地调试
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "dev"
/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "dev"


207 服务器, 调用 MQ 爬虫守护进程
# 岁岁年年迎福气
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "prod"
# 公众号(根据抓取目标用户数,自动计算需要启动 X 个进程同时抓取。每 100 个目标抓取用户,占用一个进程)
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "gzh" "gongzhonghao" "author" "prod"
# 西瓜账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "prod"
# 西瓜搜索
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "prod"
# 本山祝福
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "prod"
# 快手推荐
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "prod"
# 快手账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "prod"
# 抖音推荐
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "prod"
# 抖音账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "prod"
# 小年糕播放榜
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "prod"
# 小年糕上升榜
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "prod"
# 小年糕账号
* * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "prod"
# 看一看推荐 1
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "prod"
# 看一看推荐健康类
* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "prod"
# 西瓜推荐 1
* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "prod"
# 西瓜推荐民生类
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "xgms" "xigua" "recommend" "prod"
# 启动 Appium 
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/start_appium.sh "recommend" "shipinhao" "prod"
# 视频号推荐
* * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "sph" "shipinhao" "recommend" "prod"
# 视频号搜索
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "sph" "shipinhao" "search" "prod"
# 祝福圈子
* * * * * /bin/sh /Users/piaoquan/Desktop/piaoquan_crawler/main/process_mq.sh "zfqz" "zhufuquanzi" "recommend" "prod"


杀进程
ps aux | grep suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Appium | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9

生成 requirements.txt

cd ./piaoquan_crawler && pipreqs ./ --force

# pip3 install Appium-Python-Client
Appium_Python_Client==2.10.1
# 翻墙, pip3 install git+https://github.com/pyatom/pyatom/
atomac==1.2.0
# pip3 install ffmpeg-python
ffmpeg==1.4
# pip3 install loguru
loguru==0.6.0
# pip3 install lxml
lxml==4.9.1
# pip3 install mq_http_sdk, 若您使用的SDK版本为v1.0.0,您需要安装大于等于2.5且小于3.0版本的Python。若您使用的SDK版本大于v1.0.0,您需要安装2.5及以上版本的Python。
mq_http_sdk==1.0.3
# sudo pip3 install oss2
oss2==2.15.0
# pip3 install psutil
psutil==5.9.2
# pip3 install PyExecJS
PyExecJS==1.5.1
# pip3 install PyMysql
PyMySQL==1.0.2
# pip3 install redis
redis==4.5.1
# pip3 install requests
requests==2.27.1
# pip3 install selenium==4.2.0
selenium==4.9.1
# pip3 install urllib3
urllib3==1.26.9
# pip3 install jieba
jieba==0.42.1
# pip3 install workalendar
workalendar==17.0.0
# pip3 install aliyun_python_sdk
# pip3 install -U aliyun-log-python-sdk
aliyun_python_sdk==2.2.0
# pip3 install opencv-python / pip3 install opencv-contrib-python
opencv-python~=4.8.0.74
# pip3 install scikit-learn
scikit-learn~=1.3.0
# pip3 install beautifulsoup4