#!/bin/bash
# environment variables for fleet distribute training
echo 'begin to train...'

export GLOG_v=0
ulimit -c unlimited

# download dependency
wget https://paddlerec.bj.bcebos.com/benchmark/pgl/dependency_py310.tar.gz --no-check-certificate
tar -zxvf dependency_py310.tar.gz
rm dependency_py310.tar.gz

SOURCE_HOME=$(readlink -f $(dirname ${BASH_SOURCE[0]}) )/
PGLBOX_HOME=${SOURCE_HOME}/../
LOG_DIR="${PGLBOX_HOME}/log"
[ ! -d ${LOG_DIR} ] && mkdir -p ${LOG_DIR}

config_file="${PGLBOX_HOME}/models/graph/config.yaml" # 模型配置文件，可以修改

# environment variables for fleet distribute training
source ${PGLBOX_HOME}/tools/utils/static_ps/pglbox_util.sh

unset PYTHONHOME
unset PYTHONPATH

# download graph data
graph_data_hdfs_path=`parse_yaml2 ${config_file} graph_data_hdfs_path`
graph_data_local_path=`parse_yaml2 ${config_file} graph_data_local_path`
if [ -z "$graph_data_hdfs_path" ]; then
    echo "download default graph data"
    wget https://paddlerec.bj.bcebos.com/benchmark/pgl/data.tar.gz --no-check-certificate
    tar -zxvf data.tar.gz
    rm data.tar.gz
    touch ${PGLBOX_HOME}/data/download.done
else
    echo "download your graph data"
    sh ${PGLBOX_HOME}/tools/utils/static_ps/download_graph_data.sh ${graph_data_hdfs_path} ${graph_data_local_path} ${config_file}> ${LOG_DIR}/graph_data.log 2>&1 &
fi

# train
sharding=`grep sharding $config_file | sed s/#.*//g | grep sharding | awk -F':' '{print $1}' | sed 's/ //g'`
if [ "${sharding}" = "sharding" ]; then
   export FLAGS_enable_adjust_op_order=2
fi

pretrained_model=`parse_yaml2 $config_file pretrained_model`
sage_mode=`parse_yaml2 $config_file sage_mode`
if [[ ${pretrained_model} =~ "1.5B" ]] || [[ ${pretrained_model} =~ "10B" ]]; then
    echo "pretrained_model is [${pretrained_model}], using LLM_MODELING"
    export LLM_MODELING=true
fi

# environment variables for fleet distribute training
export FLAGS_enable_pir_api=0 #PS模式不支持新IR
export FLAGS_dynamic_static_unified_comm=false #PGLBOX最新不支持新通信库
export NCCL_DEBUG=INFO
export FLAGS_LAUNCH_BARRIER=0
export PADDLE_TRAINERS=1
export FLAGS_enable_tracker_all2all=false
export FLAGS_enable_auto_rdma_trans=true
export FLAGS_enable_all2all_use_fp16=false
export FLAGS_check_nan_inf=false
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_memory_fraction_of_eager_deletion=1
export FLAGS_control_flow_use_new_executor=false
export FLAGS_graph_neighbor_size_percent=1.0
export FLAGS_graph_edges_split_mode="hard"
export FLAGS_enable_graph_multi_node_sampling=false
export FLAGS_new_executor_use_local_scope=0

# multiple machines need high version nccl
# set launch mode GROUP, after more than nccl2.9 default PARALLEL multi-thread will blocking
export NCCL_LAUNCH_MODE=GROUP
# export NCCL_ROOT=/home/work/nccl/nccl2.16.2_cuda11  # 注意nccl位置
# export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH

if [[ ! -z "$MPI_NODE_NUM" ]] && [[ $MPI_NODE_NUM -gt 1 ]]; then
    echo "PADDLE_TRAINER_ID: $PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS: $PADDLE_TRAINER_ENDPOINTS, PADDLE_CURRENT_ENDPOINT: $PADDLE_CURRENT_ENDPOINT"
    export PADDLE_WITH_GLOO=2
    export PADDLE_GLOO_RENDEZVOUS=3
    export FLAGS_graph_edges_split_only_by_src_id=true
    export FLAGS_enable_graph_multi_node_sampling=true
    if [ $FLAGS_graph_edges_split_mode = "hard" ]; then
        echo "run gpugraph in hard mode"
    elif [ $FLAGS_graph_edges_split_mode = "fennel" ]; then
        export FLAGS_enable_sparse_inner_gather=false
        export FLAGS_query_dest_rank_by_multi_node=true
        echo "run gpugraph in fennel mode"
    fi

    if [ $sage_mode = "True" ]; then
        export FLAGS_graph_embedding_split_infer_mode=false
        echo "run gpugraph in sage mode"
    else
        export FLAGS_graph_embedding_split_infer_mode=true
        echo "run gpugraph in deepwalk mode"
    fi
else
    export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
    export POD_IP=127.0.0.1
    export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011"  #set free port if 29011 is occupied
    export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS/,*/}
    export PADDLE_PSERVER_PORT_ARRAY=(29011)
    export PADDLE_TRAINER_ID=0
    export TRAINING_ROLE=TRAINER
    export PADDLE_PORT=8800
fi

export LD_PRELOAD=./dependency/libjemalloc.so
# jemalloc parameter tuning
export MALLOC_CONF=background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000
export DEPENDENCY_HOME=./dependency
export HADOOP_HOME="${PGLBOX_HOME}/dependency/hadoop-client/hadoop"
export PATH=${PGLBOX_HOME}/dependency/hadoop-client/hadoop/bin:$PATH
export GZSHELL="${PGLBOX_HOME}/dependency/gzshell"

selected_gpus=`grep gpus: $config_file |sed s/#.*//g | grep gpus | awk -F':' '{print $2}' | sed "s/\[//g" | sed "s/\]//g" | sed "s/ //g"`
export FLAGS_selected_gpus=${selected_gpus}
export FLAGS_free_when_no_cache_hit=true
export FLAGS_use_stream_safe_cuda_allocator=true
export FLAGS_gpugraph_enable_hbm_table_collision_stat=false
export FLAGS_gpugraph_hbm_table_load_factor=0.75
export FLAGS_gpugraph_enable_segment_merge_grads=true
export FLAGS_gpugraph_merge_grads_segment_size=128
export FLAGS_gpugraph_dedup_pull_push_mode=1
export FLAGS_gpugraph_load_node_list_into_hbm=false

export FLAGS_gpugraph_storage_mode=3

export FLAGS_enable_exit_when_partial_worker=false
export FLAGS_gpugraph_debug_gpu_memory=false
export FLAGS_enable_neighbor_list_use_uva=false
max_seq_len=`parse_yaml2 $config_file max_seq_len`
if [ "$max_seq_len" != "" ]; then
    export FLAGS_gpugraph_slot_feasign_max_num=${max_seq_len}
else
    export FLAGS_gpugraph_slot_feasign_max_num=200
fi
    
export FLAGS_gpugraph_enable_gpu_direct_access=false
export FLAGS_graph_load_in_parallel=true
sage_mode=`grep sage_mode $config_file | sed s/#.*//g | grep sage_mode | awk -F':' '{print $2}' | sed 's/ //g'`
if [ "${sage_mode}" = "True" ] || [ "${sage_mode}" = "true" ]; then
    export FLAGS_enable_exit_when_partial_worker=true
    echo "FLAGS_enable_exit_when_partial_worker is true"
else
    export FLAGS_enable_exit_when_partial_worker=false
    echo "FLAGS_enable_exit_when_partial_worker is false"
fi

metapath_split_opt=`grep metapath_split_opt $config_file | sed s/#.*//g | grep metapath_split_opt | awk -F':' '{print $2}' | sed 's/ //g'`
if [ "${metapath_split_opt}" == "True" ] || [ "${metapath_split_opt}" == "true" ];then
    export FLAGS_graph_metapath_split_opt=true
    echo "FLAGS_graph_metapath_split_opt is true"
else
    export FLAGS_graph_metapath_split_opt=false
    echo "FLAGS_graph_metapath_split_opt is false"
fi
    
part_num=`grep num_part $config_file | sed s/#.*//g | grep num_part | awk -F':' '{print $2}' | sed 's/ //g'`
if [ ${part_num} -eq 1000 ];then
    echo "will run full graph"
    export FLAGS_graph_get_neighbor_id=false
else
    echo "will sub part graph"
    export FLAGS_graph_get_neighbor_id=true
fi

data_path=`parse_yaml2 $config_file graph_data_local_path`
echo "data_path:"$data_path
if [[ ${data_path} =~ "raid0" ]]; then
    echo "set export FLAGS_rocksdb_path=/raid0/database"
    export FLAGS_rocksdb_path="/raid0/database"
fi

PYTHON_HOME=${PGLBOX_HOME}/dependency/cpython-3.10.0
export PATH=${PYTHON_HOME}/bin:${PATH}
export LD_LIBRARY_PATH=${PYTHON_HOME}/lib:${LD_LIBRARY_PATH}
set -x
which python
unset PYTHONHOME
unset PYTHONPATH

# install paddlepaddle-gpu whl
# python -m pip install paddlepaddle-gpu==2.6.1

ret=0
for((i=0;i<$PADDLE_TRAINERS;i++))
do
    python -u tools/static_pglbox_trainer.py -m $config_file &> ./log/trainer.$i.log

done
ret=$?

if [[ $ret -ne 0 ]]; then
    echo "Something failed in cluster_train_and_infer.py"
    exit 1
fi