#!/bin/bash
# PGLBox launch script: set up the fleet distributed-training environment and start training
echo 'begin to train...'
export GLOG_v=0
ulimit -c unlimited
# download prebuilt dependencies (bundles cpython-3.10.0, hadoop-client, jemalloc, gzshell)
wget https://paddlerec.bj.bcebos.com/benchmark/pgl/dependency_py310.tar.gz --no-check-certificate
tar -zxvf dependency_py310.tar.gz
rm dependency_py310.tar.gz
SOURCE_HOME=$(readlink -f $(dirname ${BASH_SOURCE[0]}))/
PGLBOX_HOME=${SOURCE_HOME}/../
LOG_DIR="${PGLBOX_HOME}/log"
[ ! -d ${LOG_DIR} ] && mkdir -p ${LOG_DIR}
config_file="${PGLBOX_HOME}/models/graph/config.yaml" # model config file; edit as needed
# shell helpers (parse_yaml2 and friends) for reading the YAML config
source ${PGLBOX_HOME}/tools/utils/static_ps/pglbox_util.sh
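# parse_yaml2 is assumed to take (file, key) and print the value of a top-level
# YAML key, e.g. a config line "sage_mode: True" yields "True"; see pglbox_util.sh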
unset PYTHONHOME
unset PYTHONPATH
# download graph data
graph_data_hdfs_path=`parse_yaml2 ${config_file} graph_data_hdfs_path`
graph_data_local_path=`parse_yaml2 ${config_file} graph_data_local_path`
if [ -z "$graph_data_hdfs_path" ]; then
    echo "downloading default graph data"
    wget https://paddlerec.bj.bcebos.com/benchmark/pgl/data.tar.gz --no-check-certificate
    tar -zxvf data.tar.gz
    rm data.tar.gz
    touch ${PGLBOX_HOME}/data/download.done
else
    echo "downloading user graph data"
    sh ${PGLBOX_HOME}/tools/utils/static_ps/download_graph_data.sh ${graph_data_hdfs_path} ${graph_data_local_path} ${config_file} > ${LOG_DIR}/graph_data.log 2>&1 &
fi
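# NOTE: the HDFS download above runs in the background; later stages are assumed
# to wait for it to finish (the default-data branch drops a download.done marker)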
# train
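# the pipeline below prints the key name itself (awk '{print $1}'), so the
# comparison is only a presence check for an uncommented "sharding" entry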
sharding=`grep sharding $config_file | sed s/#.*//g | grep sharding | awk -F':' '{print $1}' | sed 's/ //g'`
if [ "${sharding}" = "sharding" ]; then
    export FLAGS_enable_adjust_op_order=2
fi
pretrained_model=`parse_yaml2 $config_file pretrained_model`
sage_mode=`parse_yaml2 $config_file sage_mode`
if [[ ${pretrained_model} =~ "1.5B" ]] || [[ ${pretrained_model} =~ "10B" ]]; then
    echo "pretrained_model is [${pretrained_model}], using LLM_MODELING"
    export LLM_MODELING=true
fi
# environment variables for fleet distributed training
export FLAGS_enable_pir_api=0 # PS mode does not support the new IR
export FLAGS_dynamic_static_unified_comm=false # current PGLBox does not support the new communication library
export NCCL_DEBUG=INFO
export FLAGS_LAUNCH_BARRIER=0
export PADDLE_TRAINERS=1
export FLAGS_enable_tracker_all2all=false
export FLAGS_enable_auto_rdma_trans=true
export FLAGS_enable_all2all_use_fp16=false
export FLAGS_check_nan_inf=false
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_memory_fraction_of_eager_deletion=1
export FLAGS_control_flow_use_new_executor=false
export FLAGS_graph_neighbor_size_percent=1.0
export FLAGS_graph_edges_split_mode="hard"
export FLAGS_enable_graph_multi_node_sampling=false
export FLAGS_new_executor_use_local_scope=0
# multi-machine runs need a recent NCCL version
# use launch mode GROUP: from NCCL 2.9 on, the default PARALLEL mode can block under multi-threaded launches
export NCCL_LAUNCH_MODE=GROUP
# export NCCL_ROOT=/home/work/nccl/nccl2.16.2_cuda11 # adjust to your NCCL install location
# export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH
if [[ ! -z "$MPI_NODE_NUM" ]] && [[ $MPI_NODE_NUM -gt 1 ]]; then
    echo "PADDLE_TRAINER_ID: $PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS: $PADDLE_TRAINER_ENDPOINTS, PADDLE_CURRENT_ENDPOINT: $PADDLE_CURRENT_ENDPOINT"
    export PADDLE_WITH_GLOO=2
    export PADDLE_GLOO_RENDEZVOUS=3
    export FLAGS_graph_edges_split_only_by_src_id=true
    export FLAGS_enable_graph_multi_node_sampling=true
    if [ "${FLAGS_graph_edges_split_mode}" = "hard" ]; then
        echo "run gpugraph in hard mode"
    elif [ "${FLAGS_graph_edges_split_mode}" = "fennel" ]; then
        export FLAGS_enable_sparse_inner_gather=false
        export FLAGS_query_dest_rank_by_multi_node=true
        echo "run gpugraph in fennel mode"
    fi
    if [ "${sage_mode}" = "True" ] || [ "${sage_mode}" = "true" ]; then
        export FLAGS_graph_embedding_split_infer_mode=false
        echo "run gpugraph in sage mode"
    else
        export FLAGS_graph_embedding_split_infer_mode=true
        echo "run gpugraph in deepwalk mode"
    fi
else
    export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
    export POD_IP=127.0.0.1
    export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011" # pick a free port if 29011 is occupied
    export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS/,*/}
    export PADDLE_PSERVER_PORT_ARRAY=(29011)
    export PADDLE_TRAINER_ID=0
    export TRAINING_ROLE=TRAINER
    export PADDLE_PORT=8800
fi
export LD_PRELOAD=./dependency/libjemalloc.so
# jemalloc parameter tuning
export MALLOC_CONF=background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000
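# background_thread:true purges unused pages on a jemalloc background thread,
# metadata_thp:auto allows transparent huge pages for allocator metadata, and
# the 30s dirty/muzzy decay keeps freed pages around longer, trading resident
# memory for fewer madvise calls on allocation-heavy training loops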
export DEPENDENCY_HOME=./dependency
export HADOOP_HOME="${PGLBOX_HOME}/dependency/hadoop-client/hadoop"
export PATH=${PGLBOX_HOME}/dependency/hadoop-client/hadoop/bin:$PATH
export GZSHELL="${PGLBOX_HOME}/dependency/gzshell"
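# read the gpus list from config.yaml, stripping brackets and spaces,
# e.g. "gpus: [0, 1, 2, 3]" becomes "0,1,2,3"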
selected_gpus=`grep gpus: $config_file | sed s/#.*//g | grep gpus | awk -F':' '{print $2}' | sed "s/\[//g" | sed "s/\]//g" | sed "s/ //g"`
export FLAGS_selected_gpus=${selected_gpus}
export FLAGS_free_when_no_cache_hit=true
export FLAGS_use_stream_safe_cuda_allocator=true
export FLAGS_gpugraph_enable_hbm_table_collision_stat=false
export FLAGS_gpugraph_hbm_table_load_factor=0.75
export FLAGS_gpugraph_enable_segment_merge_grads=true
export FLAGS_gpugraph_merge_grads_segment_size=128
export FLAGS_gpugraph_dedup_pull_push_mode=1
export FLAGS_gpugraph_load_node_list_into_hbm=false
export FLAGS_gpugraph_storage_mode=3
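# storage mode selects the placement tier for embeddings and graph structure
# (HBM vs. host memory vs. SSD); mode 3 reportedly keeps embeddings in host
# memory with the graph on GPU; check the PGLBox docs for the exact mapping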
export FLAGS_enable_exit_when_partial_worker=false
export FLAGS_gpugraph_debug_gpu_memory=false
export FLAGS_enable_neighbor_list_use_uva=false
max_seq_len=`parse_yaml2 $config_file max_seq_len`
if [ "$max_seq_len" != "" ]; then
    export FLAGS_gpugraph_slot_feasign_max_num=${max_seq_len}
else
    export FLAGS_gpugraph_slot_feasign_max_num=200
fi

export FLAGS_gpugraph_enable_gpu_direct_access=false
export FLAGS_graph_load_in_parallel=true
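# re-read sage_mode with a plain grep pipeline (redundant with the parse_yaml2
# call above) and toggle partial-worker exit accordingly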
sage_mode=`grep sage_mode $config_file | sed s/#.*//g | grep sage_mode | awk -F':' '{print $2}' | sed 's/ //g'`
if [ "${sage_mode}" = "True" ] || [ "${sage_mode}" = "true" ]; then
    export FLAGS_enable_exit_when_partial_worker=true
    echo "FLAGS_enable_exit_when_partial_worker is true"
else
    export FLAGS_enable_exit_when_partial_worker=false
    echo "FLAGS_enable_exit_when_partial_worker is false"
fi
metapath_split_opt=`grep metapath_split_opt $config_file | sed s/#.*//g | grep metapath_split_opt | awk -F':' '{print $2}' | sed 's/ //g'`
if [ "${metapath_split_opt}" == "True" ] || [ "${metapath_split_opt}" == "true" ]; then
    export FLAGS_graph_metapath_split_opt=true
    echo "FLAGS_graph_metapath_split_opt is true"
else
    export FLAGS_graph_metapath_split_opt=false
    echo "FLAGS_graph_metapath_split_opt is false"
fi

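# by this script's convention, num_part == 1000 marks the full graph dataset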
part_num=`grep num_part $config_file | sed s/#.*//g | grep num_part | awk -F':' '{print $2}' | sed 's/ //g'`
if [ "${part_num:-0}" -eq 1000 ]; then
    echo "will run full graph"
    export FLAGS_graph_get_neighbor_id=false
else
    echo "will run sub-part graph"
    export FLAGS_graph_get_neighbor_id=true
fi
data_path=`parse_yaml2 $config_file graph_data_local_path`
echo "data_path: ${data_path}"
if [[ ${data_path} =~ "raid0" ]]; then
    echo "setting FLAGS_rocksdb_path=/raid0/database"
    export FLAGS_rocksdb_path="/raid0/database"
fi
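# assumption: FLAGS_rocksdb_path is the on-disk cache used by SSD-backed sparse
# storage, so it is pointed at the RAID-0 array when the data already lives there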
PYTHON_HOME=${PGLBOX_HOME}/dependency/cpython-3.10.0
export PATH=${PYTHON_HOME}/bin:${PATH}
export LD_LIBRARY_PATH=${PYTHON_HOME}/lib:${LD_LIBRARY_PATH}
set -x
which python
unset PYTHONHOME
unset PYTHONPATH
# install the paddlepaddle-gpu wheel if it is not already bundled, e.g.:
# python -m pip install paddlepaddle-gpu==2.6.1
ret=0
for((i=0;i<$PADDLE_TRAINERS;i++))
do
    python -u tools/static_pglbox_trainer.py -m $config_file &> ${LOG_DIR}/trainer.$i.log
done
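# NOTE: with PADDLE_TRAINERS=1 a sequential launch is fine; to run several
# trainers per node they would need to start concurrently, e.g. by
# backgrounding each python process with '&' and adding a 'wait' after the loop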
ret=$?
if [[ $ret -ne 0 ]]; then
    echo "Something failed in static_pglbox_trainer.py"
    exit 1
fi