fix pytorch bert

sneaxiy 2021-08-30 10:15:35 +00:00
parent 248927e6fd
commit 6101e02baf
3 changed files with 45 additions and 80 deletions


@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
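
The file added above is the standard BERT-Base (uncased) configuration: 12 transformer layers, hidden size 768, 12 attention heads, a 3072-wide feed-forward layer, and a 30522-entry WordPiece vocabulary. It matches the switch from bert-large-uncased to bert-base-uncased further down in this commit. A minimal sanity check, assuming the file is saved as bert_config_base.json (the name later assigned to BERT_CONFIG in the pretraining script):

# Hypothetical check; bert_config_base.json is the assumed filename for the JSON above.
python3 -c 'import json; c = json.load(open("bert_config_base.json")); print(c["num_hidden_layers"], c["hidden_size"], c["num_attention_heads"])'
# expected output: 12 768 12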


@@ -0,0 +1,21 @@
#!/bin/bash
set -xe
export LD_LIBRARY_PATH=/usr/lib/libibverbs/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
PADDLE_TRAINER_ENDPOINTS=`echo $PADDLE_TRAINER_ENDPOINTS | tr ',' '\n' | head -n 1`
batch_size=${1:-"96"} # batch size per gpu
num_gpus=${2:-"8"} # number of GPUs per node
precision=${3:-"fp16"} # fp32 | fp16
gradient_accumulation_steps=$(expr 67584 \/ $batch_size \/ $num_gpus)
train_batch_size=$(expr 67584 \/ $num_gpus) # per-GPU batch size, i.e. micro-batch size x gradient accumulation steps
train_steps=${4:-4} # max train steps
export NODE_RANK=`python get_mpi_rank.py`
cd ${HOME_WORK_DIR}/workspace/env_run/zengjinle/DeepLearningExamples/PyTorch/LanguageModeling/BERT
rm -rf ./results/checkpoints
# run pre-training
bash scripts/run_pretraining.sh $train_batch_size 6e-3 $precision $num_gpus 0.2843 $train_steps 200 false true true $gradient_accumulation_steps
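
The wrapper script pins the effective global batch to 67584 sequences and derives the per-GPU settings from it. With the defaults above (batch_size=96 per GPU, num_gpus=8), that works out to 88 gradient-accumulation steps and a per-GPU batch of 8448; a small standalone sketch of the same arithmetic:

# Reproduces the expr arithmetic from the script with its default arguments.
batch_size=96; num_gpus=8
echo $(( 67584 / batch_size / num_gpus ))  # gradient_accumulation_steps -> 88
echo $(( 67584 / num_gpus ))               # train_batch_size per GPU    -> 8448
echo $(( batch_size * num_gpus * 88 ))     # effective global batch      -> 67584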


@@ -34,12 +34,13 @@ learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
-DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
+export BERT_PREP_WORKING_DIR=${BERT_PREP_WORKING_DIR:-"/root/paddlejob/workspace/env_run/data"}
+DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
-BERT_CONFIG=bert_config.json
+BERT_CONFIG=bert_config_base.json
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
CODEDIR=${23:-"/workspace/bert"}
CODEDIR=${23:-"/root/paddlejob/workspace/env_run/zengjinle/DeepLearningExamples/PyTorch/LanguageModeling/BERT"}
init_checkpoint=${24:-"None"}
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
@@ -107,7 +108,7 @@ CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --bert_model=bert-base-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
@@ -125,7 +126,12 @@ CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
MASTER_ADDR=`echo $PADDLE_TRAINER_ENDPOINTS | awk -F',' '{print $1}' | awk -F':' '{print $1}'`
MASTER_PORT=`echo $PADDLE_TRAINER_ENDPOINTS | awk -F',' '{print $1}' | awk -F':' '{print $2}'`
NUM_NODES=`echo $PADDLE_TRAINER_ENDPOINTS | tr ',' '\n' | wc -l`
NODE_RANK=$PADDLE_TRAINER_ID
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus --nnodes=$NUM_NODES --node_rank=$NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT $CMD"
if [ "$create_logfile" = "true" ] ; then
@@ -148,78 +154,3 @@ fi
set +x
echo "finished pretraining"
-#Start Phase2
-PREC=""
-if [ "$precision" = "fp16" ] ; then
-PREC="--fp16"
-elif [ "$precision" = "fp32" ] ; then
-PREC=""
-elif [ "$precision" = "tf32" ] ; then
-PREC=""
-else
-echo "Unknown <precision> argument"
-exit -2
-fi
-ACCUMULATE_GRADIENTS=""
-if [ "$accumulate_gradients" == "true" ] ; then
-ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
-fi
-ALL_REDUCE_POST_ACCUMULATION=""
-if [ "$allreduce_post_accumulation" == "true" ] ; then
-ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
-fi
-ALL_REDUCE_POST_ACCUMULATION_FP16=""
-if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
-ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
-fi
-echo $DATA_DIR_PHASE2
-INPUT_DIR=$DATA_DIR_PHASE2
-CMD=" $CODEDIR/run_pretraining.py"
-CMD+=" --input_dir=$DATA_DIR_PHASE2"
-CMD+=" --output_dir=$CHECKPOINTS_DIR"
-CMD+=" --config_file=$BERT_CONFIG"
-CMD+=" --bert_model=bert-large-uncased"
-CMD+=" --train_batch_size=$train_batch_size_phase2"
-CMD+=" --max_seq_length=512"
-CMD+=" --max_predictions_per_seq=80"
-CMD+=" --max_steps=$train_steps_phase2"
-CMD+=" --warmup_proportion=$warmup_proportion_phase2"
-CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
-CMD+=" --learning_rate=$learning_rate_phase2"
-CMD+=" --seed=$seed"
-CMD+=" $PREC"
-CMD+=" $ACCUMULATE_GRADIENTS"
-CMD+=" $CHECKPOINT"
-CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
-CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
-CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
-CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
-CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
-if [ "$create_logfile" = "true" ] ; then
-export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
-printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
-DATESTAMP=`date +'%y%m%d%H%M%S'`
-LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
-printf "Logs written to %s\n" "$LOGFILE"
-fi
-set -x
-if [ -z "$LOGFILE" ] ; then
-$CMD
-else
-(
-$CMD
-) |& tee $LOGFILE
-fi
-set +x
-echo "finished phase2"