fix pytorch bert

sneaxiy 2021-08-30 10:15:35 +00:00
parent 248927e6fd
commit 6101e02baf
3 changed files with 45 additions and 80 deletions


@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
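
The file added above is the standard BERT-Base (uncased) configuration: 12 transformer layers, hidden size 768, 12 attention heads, a 3072-wide feed-forward layer, and a 30522-entry WordPiece vocabulary. It matches the switch from bert-large-uncased to bert-base-uncased further down in this commit. A minimal sanity check, assuming the file is saved as bert_config_base.json (the name later assigned to BERT_CONFIG in the pretraining script):

# Hypothetical check; bert_config_base.json is the assumed filename for the JSON above.
python3 -c 'import json; c = json.load(open("bert_config_base.json")); print(c["num_hidden_layers"], c["hidden_size"], c["num_attention_heads"])'
# expected output: 12 768 12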


@@ -0,0 +1,21 @@
#!/bin/bash
set -xe
export LD_LIBRARY_PATH=/usr/lib/libibverbs/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
PADDLE_TRAINER_ENDPOINTS=`echo $PADDLE_TRAINER_ENDPOINTS | tr ',' '\n' | head -n 1`
batch_size=${1:-"96"} # batch size per gpu
num_gpus=${2:-"8"} # number of GPUs per node
precision=${3:-"fp16"} # fp32 | fp16
gradient_accumulation_steps=$(expr 67584 \/ $batch_size \/ $num_gpus)
train_batch_size=$(expr 67584 \/ $num_gpus) # per-GPU batch size, i.e. micro-batch size x gradient accumulation steps
train_steps=${4:-4} # max train steps
export NODE_RANK=`python get_mpi_rank.py`
cd ${HOME_WORK_DIR}/workspace/env_run/zengjinle/DeepLearningExamples/PyTorch/LanguageModeling/BERT
rm -rf ./results/checkpoints
# run pre-training
bash scripts/run_pretraining.sh $train_batch_size 6e-3 $precision $num_gpus 0.2843 $train_steps 200 false true true $gradient_accumulation_steps
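
The wrapper script pins the effective global batch to 67584 sequences and derives the per-GPU settings from it. With the defaults above (batch_size=96 per GPU, num_gpus=8), that works out to 88 gradient-accumulation steps and a per-GPU batch of 8448; a small standalone sketch of the same arithmetic:

# Reproduces the expr arithmetic from the script with its default arguments.
batch_size=96; num_gpus=8
echo $(( 67584 / batch_size / num_gpus ))  # gradient_accumulation_steps -> 88
echo $(( 67584 / num_gpus ))               # train_batch_size per GPU    -> 8448
echo $(( batch_size * num_gpus * 88 ))     # effective global batch      -> 67584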


@@ -34,12 +34,13 @@ learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
-DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
+export BERT_PREP_WORKING_DIR=${BERT_PREP_WORKING_DIR:-"/root/paddlejob/workspace/env_run/data"}
+DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
-BERT_CONFIG=bert_config.json
+BERT_CONFIG=bert_config_base.json
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
CODEDIR=${23:-"/workspace/bert"}
CODEDIR=${23:-"/root/paddlejob/workspace/env_run/zengjinle/DeepLearningExamples/PyTorch/LanguageModeling/BERT"}
init_checkpoint=${24:-"None"}
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
@@ -107,7 +108,7 @@ CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --bert_model=bert-base-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
@@ -125,7 +126,12 @@ CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
MASTER_ADDR=`echo $PADDLE_TRAINER_ENDPOINTS | awk -F',' '{print $1}' | awk -F':' '{print $1}'`
MASTER_PORT=`echo $PADDLE_TRAINER_ENDPOINTS | awk -F',' '{print $1}' | awk -F':' '{print $2}'`
NUM_NODES=`echo $PADDLE_TRAINER_ENDPOINTS | tr ',' '\n' | wc -l`
NODE_RANK=$PADDLE_TRAINER_ID
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus --nnodes=$NUM_NODES --node_rank=$NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT $CMD"
if [ "$create_logfile" = "true" ] ; then
@@ -148,78 +154,3 @@ fi
set +x
echo "finished pretraining"
-#Start Phase2
-PREC=""
-if [ "$precision" = "fp16" ] ; then
-PREC="--fp16"
-elif [ "$precision" = "fp32" ] ; then
-PREC=""
-elif [ "$precision" = "tf32" ] ; then
-PREC=""
-else
-echo "Unknown <precision> argument"
-exit -2
-fi
-ACCUMULATE_GRADIENTS=""
-if [ "$accumulate_gradients" == "true" ] ; then
-ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
-fi
-ALL_REDUCE_POST_ACCUMULATION=""
-if [ "$allreduce_post_accumulation" == "true" ] ; then
-ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
-fi
-ALL_REDUCE_POST_ACCUMULATION_FP16=""
-if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
-ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
-fi
-echo $DATA_DIR_PHASE2
-INPUT_DIR=$DATA_DIR_PHASE2
-CMD=" $CODEDIR/run_pretraining.py"
-CMD+=" --input_dir=$DATA_DIR_PHASE2"
-CMD+=" --output_dir=$CHECKPOINTS_DIR"
-CMD+=" --config_file=$BERT_CONFIG"
-CMD+=" --bert_model=bert-large-uncased"
-CMD+=" --train_batch_size=$train_batch_size_phase2"
-CMD+=" --max_seq_length=512"
-CMD+=" --max_predictions_per_seq=80"
-CMD+=" --max_steps=$train_steps_phase2"
-CMD+=" --warmup_proportion=$warmup_proportion_phase2"
-CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
-CMD+=" --learning_rate=$learning_rate_phase2"
-CMD+=" --seed=$seed"
-CMD+=" $PREC"
-CMD+=" $ACCUMULATE_GRADIENTS"
-CMD+=" $CHECKPOINT"
-CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
-CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
-CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
-CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
-CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
-if [ "$create_logfile" = "true" ] ; then
-export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
-printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
-DATESTAMP=`date +'%y%m%d%H%M%S'`
-LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
-printf "Logs written to %s\n" "$LOGFILE"
-fi
-set -x
-if [ -z "$LOGFILE" ] ; then
-$CMD
-else
-(
-$CMD
-) |& tee $LOGFILE
-fi
-set +x
-echo "finished phase2"