run_pglbox.sh 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #!/bin/bash
  2. # environment variables for fleet distribute training
  3. echo 'begin to train...'
  4. export GLOG_v=0
  5. ulimit -c unlimited
  6. # download dependency
  7. wget https://paddlerec.bj.bcebos.com/benchmark/pgl/dependency_py310.tar.gz --no-check-certificate
  8. tar -zxvf dependency_py310.tar.gz
  9. rm dependency_py310.tar.gz
  10. SOURCE_HOME=$(readlink -f $(dirname ${BASH_SOURCE[0]}) )/
  11. PGLBOX_HOME=${SOURCE_HOME}/../
  12. LOG_DIR="${PGLBOX_HOME}/log"
  13. [ ! -d ${LOG_DIR} ] && mkdir -p ${LOG_DIR}
  14. config_file="${PGLBOX_HOME}/models/graph/config.yaml" # 模型配置文件,可以修改
  15. # environment variables for fleet distribute training
  16. source ${PGLBOX_HOME}/tools/utils/static_ps/pglbox_util.sh
  17. unset PYTHONHOME
  18. unset PYTHONPATH
  19. # download graph data
  20. graph_data_hdfs_path=`parse_yaml2 ${config_file} graph_data_hdfs_path`
  21. graph_data_local_path=`parse_yaml2 ${config_file} graph_data_local_path`
  22. if [ -z "$graph_data_hdfs_path" ]; then
  23. echo "download default graph data"
  24. wget https://paddlerec.bj.bcebos.com/benchmark/pgl/data.tar.gz --no-check-certificate
  25. tar -zxvf data.tar.gz
  26. rm data.tar.gz
  27. touch ${PGLBOX_HOME}/data/download.done
  28. else
  29. echo "download your graph data"
  30. sh ${PGLBOX_HOME}/tools/utils/static_ps/download_graph_data.sh ${graph_data_hdfs_path} ${graph_data_local_path} ${config_file}> ${LOG_DIR}/graph_data.log 2>&1 &
  31. fi
  32. # train
  33. sharding=`grep sharding $config_file | sed s/#.*//g | grep sharding | awk -F':' '{print $1}' | sed 's/ //g'`
  34. if [ "${sharding}" = "sharding" ]; then
  35. export FLAGS_enable_adjust_op_order=2
  36. fi
  37. pretrained_model=`parse_yaml2 $config_file pretrained_model`
  38. sage_mode=`parse_yaml2 $config_file sage_mode`
  39. if [[ ${pretrained_model} =~ "1.5B" ]] || [[ ${pretrained_model} =~ "10B" ]]; then
  40. echo "pretrained_model is [${pretrained_model}], using LLM_MODELING"
  41. export LLM_MODELING=true
  42. fi
  43. # environment variables for fleet distribute training
  44. export FLAGS_enable_pir_api=0 #PS模式不支持新IR
  45. export FLAGS_dynamic_static_unified_comm=false #PGLBOX最新不支持新通信库
  46. export NCCL_DEBUG=INFO
  47. export FLAGS_LAUNCH_BARRIER=0
  48. export PADDLE_TRAINERS=1
  49. export FLAGS_enable_tracker_all2all=false
  50. export FLAGS_enable_auto_rdma_trans=true
  51. export FLAGS_enable_all2all_use_fp16=false
  52. export FLAGS_check_nan_inf=false
  53. export FLAGS_eager_delete_tensor_gb=0.0
  54. export FLAGS_memory_fraction_of_eager_deletion=1
  55. export FLAGS_control_flow_use_new_executor=false
  56. export FLAGS_graph_neighbor_size_percent=1.0
  57. export FLAGS_graph_edges_split_mode="hard"
  58. export FLAGS_enable_graph_multi_node_sampling=false
  59. export FLAGS_new_executor_use_local_scope=0
  60. # multiple machines need high version nccl
  61. # set launch mode GROUP, after more than nccl2.9 default PARALLEL multi-thread will blocking
  62. export NCCL_LAUNCH_MODE=GROUP
  63. # export NCCL_ROOT=/home/work/nccl/nccl2.16.2_cuda11 # 注意nccl位置
  64. # export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH
  65. if [[ ! -z "$MPI_NODE_NUM" ]] && [[ $MPI_NODE_NUM -gt 1 ]]; then
  66. echo "PADDLE_TRAINER_ID: $PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS: $PADDLE_TRAINER_ENDPOINTS, PADDLE_CURRENT_ENDPOINT: $PADDLE_CURRENT_ENDPOINT"
  67. export PADDLE_WITH_GLOO=2
  68. export PADDLE_GLOO_RENDEZVOUS=3
  69. export FLAGS_graph_edges_split_only_by_src_id=true
  70. export FLAGS_enable_graph_multi_node_sampling=true
  71. if [ $FLAGS_graph_edges_split_mode = "hard" ]; then
  72. echo "run gpugraph in hard mode"
  73. elif [ $FLAGS_graph_edges_split_mode = "fennel" ]; then
  74. export FLAGS_enable_sparse_inner_gather=false
  75. export FLAGS_query_dest_rank_by_multi_node=true
  76. echo "run gpugraph in fennel mode"
  77. fi
  78. if [ $sage_mode = "True" ]; then
  79. export FLAGS_graph_embedding_split_infer_mode=false
  80. echo "run gpugraph in sage mode"
  81. else
  82. export FLAGS_graph_embedding_split_infer_mode=true
  83. echo "run gpugraph in deepwalk mode"
  84. fi
  85. else
  86. export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
  87. export POD_IP=127.0.0.1
  88. export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011" #set free port if 29011 is occupied
  89. export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS/,*/}
  90. export PADDLE_PSERVER_PORT_ARRAY=(29011)
  91. export PADDLE_TRAINER_ID=0
  92. export TRAINING_ROLE=TRAINER
  93. export PADDLE_PORT=8800
  94. fi
  95. export LD_PRELOAD=./dependency/libjemalloc.so
  96. # jemalloc parameter tuning
  97. export MALLOC_CONF=background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000
  98. export DEPENDENCY_HOME=./dependency
  99. export HADOOP_HOME="${PGLBOX_HOME}/dependency/hadoop-client/hadoop"
  100. export PATH=${PGLBOX_HOME}/dependency/hadoop-client/hadoop/bin:$PATH
  101. export GZSHELL="${PGLBOX_HOME}/dependency/gzshell"
  102. selected_gpus=`grep gpus: $config_file |sed s/#.*//g | grep gpus | awk -F':' '{print $2}' | sed "s/\[//g" | sed "s/\]//g" | sed "s/ //g"`
  103. export FLAGS_selected_gpus=${selected_gpus}
  104. export FLAGS_free_when_no_cache_hit=true
  105. export FLAGS_use_stream_safe_cuda_allocator=true
  106. export FLAGS_gpugraph_enable_hbm_table_collision_stat=false
  107. export FLAGS_gpugraph_hbm_table_load_factor=0.75
  108. export FLAGS_gpugraph_enable_segment_merge_grads=true
  109. export FLAGS_gpugraph_merge_grads_segment_size=128
  110. export FLAGS_gpugraph_dedup_pull_push_mode=1
  111. export FLAGS_gpugraph_load_node_list_into_hbm=false
  112. export FLAGS_gpugraph_storage_mode=3
  113. export FLAGS_enable_exit_when_partial_worker=false
  114. export FLAGS_gpugraph_debug_gpu_memory=false
  115. export FLAGS_enable_neighbor_list_use_uva=false
  116. max_seq_len=`parse_yaml2 $config_file max_seq_len`
  117. if [ "$max_seq_len" != "" ]; then
  118. export FLAGS_gpugraph_slot_feasign_max_num=${max_seq_len}
  119. else
  120. export FLAGS_gpugraph_slot_feasign_max_num=200
  121. fi
  122. export FLAGS_gpugraph_enable_gpu_direct_access=false
  123. export FLAGS_graph_load_in_parallel=true
  124. sage_mode=`grep sage_mode $config_file | sed s/#.*//g | grep sage_mode | awk -F':' '{print $2}' | sed 's/ //g'`
  125. if [ "${sage_mode}" = "True" ] || [ "${sage_mode}" = "true" ]; then
  126. export FLAGS_enable_exit_when_partial_worker=true
  127. echo "FLAGS_enable_exit_when_partial_worker is true"
  128. else
  129. export FLAGS_enable_exit_when_partial_worker=false
  130. echo "FLAGS_enable_exit_when_partial_worker is false"
  131. fi
  132. metapath_split_opt=`grep metapath_split_opt $config_file | sed s/#.*//g | grep metapath_split_opt | awk -F':' '{print $2}' | sed 's/ //g'`
  133. if [ "${metapath_split_opt}" == "True" ] || [ "${metapath_split_opt}" == "true" ];then
  134. export FLAGS_graph_metapath_split_opt=true
  135. echo "FLAGS_graph_metapath_split_opt is true"
  136. else
  137. export FLAGS_graph_metapath_split_opt=false
  138. echo "FLAGS_graph_metapath_split_opt is false"
  139. fi
  140. part_num=`grep num_part $config_file | sed s/#.*//g | grep num_part | awk -F':' '{print $2}' | sed 's/ //g'`
  141. if [ ${part_num} -eq 1000 ];then
  142. echo "will run full graph"
  143. export FLAGS_graph_get_neighbor_id=false
  144. else
  145. echo "will sub part graph"
  146. export FLAGS_graph_get_neighbor_id=true
  147. fi
  148. data_path=`parse_yaml2 $config_file graph_data_local_path`
  149. echo "data_path:"$data_path
  150. if [[ ${data_path} =~ "raid0" ]]; then
  151. echo "set export FLAGS_rocksdb_path=/raid0/database"
  152. export FLAGS_rocksdb_path="/raid0/database"
  153. fi
  154. PYTHON_HOME=${PGLBOX_HOME}/dependency/cpython-3.10.0
  155. export PATH=${PYTHON_HOME}/bin:${PATH}
  156. export LD_LIBRARY_PATH=${PYTHON_HOME}/lib:${LD_LIBRARY_PATH}
  157. set -x
  158. which python
  159. unset PYTHONHOME
  160. unset PYTHONPATH
  161. # install paddlepaddle-gpu whl
  162. # python -m pip install paddlepaddle-gpu==2.6.1
  163. ret=0
  164. for((i=0;i<$PADDLE_TRAINERS;i++))
  165. do
  166. python -u tools/static_pglbox_trainer.py -m $config_file &> ./log/trainer.$i.log
  167. done
  168. ret=$?
  169. if [[ $ret -ne 0 ]]; then
  170. echo "Something failed in cluster_train_and_infer.py"
  171. exit 1
  172. fi