DeepLearningExamples/PyTorch/LanguageModeling/BERT/scripts/start_pretraining.sh
Przemek Strzelczyk 0663b67c1a Updating models
2019-07-08 22:51:28 +02:00

90 lines
3.9 KiB
Bash

#!/bin/bash
# Purpose: submit a multi-node BERT pretraining job on a Slurm cluster.
#
# Usage:
#   start_pretraining.sh [node_type] [num_nodes] [partition] [wall_time] \
#       [job_name] [root_dir] [train_batch_size] [eval_batch_size] \
#       [train_steps] [warmup_proportion] [learning_rate] [precision] \
#       [save_checkpoint_steps] [results_dir] [checkpoints_dir]
#
# Every positional argument is optional; the default is shown on its
# assignment line below.
#
# Environment (all optional):
#   CONT      container image name; exported for the batch script.
#   DATA_DIR  host directory mounted at
#             /workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards.
#   CODEDIR   directory that holds scripts/run.sub; defaults to root_dir.
#   LOGDIR    directory that receives the sbatch log; defaults to results_dir.

# --- Positional arguments ----------------------------------------------------
node_type=${1:-"dgx1"}            # "dgx1" or "dgx2h"
num_nodes=${2:-1}
partition=${3:-"default"}         # "default" means: do not pass -p to sbatch
wall_time=${4:-"12:00:00"}
job_name=${5:-"pyt_bert"}
root_dir=${6:-"$PWD"}
train_batch_size=${7:-4}
eval_batch_size=${8:-4}
train_steps=${9:-1000000}
warmup_proportion=${10:-0.01}
learning_rate=${11:-1e-4}
precision=${12:-"fp16"}
save_checkpoint_steps=${13:-5000}
results_dir=${14:-"$root_dir/results"}
checkpoints_dir=${15:-"$root_dir/checkpoints"}

# --- Environment overrides ---------------------------------------------------
CONT=${CONT:-"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.02-py3-devel"}
# BENCHMARK / BENCHMARK_NAME are neither exported nor referenced again in this
# script; kept as-is in case the file is sourced by something that reads them.
BENCHMARK=${BENCHMARK:-"bert"}
BENCHMARK_NAME="bert"
# CODEDIR and LOGDIR were previously required to be pre-set by the caller.
# Without them the script would cd to $HOME, submit /scripts/run.sub and log
# to the filesystem root, so fall back to the directories this script already
# knows about. NOTE(review): assumes the code checkout is root_dir - confirm.
CODEDIR=${CODEDIR:-"$root_dir"}
LOGDIR=${LOGDIR:-"$results_dir"}

# --- Per-node-type hardware description --------------------------------------
case "$node_type" in
  dgx1)
    echo "Running on dgx1 systems"
    DGXSYSTEM="DGX1"
    DGXNGPU=8
    DGXSOCKETCORES=20
    DGXNSOCKET=2      # not referenced again in this script
    DGXHT=2           # 2 = hyper-threading on, 1 = off
    DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
    ;;
  dgx2h)
    echo "Running on dgx2h systems"
    DGXSYSTEM="DGX2H"
    DGXNGPU=16
    DGXSOCKETCORES=24
    DGXNSOCKET=2      # not referenced again in this script
    DGXHT=2           # 2 = hyper-threading on, 1 = off
    DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
    ;;
  *)
    # The accepted spelling is "dgx2h"; report it accurately and on stderr.
    echo "Unknown <node_type> '$node_type', must be either dgx1 or dgx2h" >&2
    # Exit statuses must be in 0-255; the original `exit -1` was out of range.
    exit 1
    ;;
esac

# --- Parameters handed to the batch script -----------------------------------
# Space-separated positional list; the literal "1" and "false" fields are
# fixed here and their meaning is defined by whatever consumes EXTRA_PARAMS.
# warmup_proportion is fractional (default 0.01), so it must NOT go through
# %d: bash would reject it as an invalid number and emit 0. Pass it verbatim.
printf -v EXTRA_PARAMS "%d %d %e %s 1 %s %d %d false" \
  "$train_batch_size" "$eval_batch_size" "$learning_rate" "$precision" \
  "$warmup_proportion" "$train_steps" "$save_checkpoint_steps"

export ROOTDIR=$root_dir
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}

# Container volume flags (host:container), kept as one string because the
# value is exported through the environment.
VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"
# VOLS+=" -v $BOOKS_DIR:/workspace/bert/data/bookcorpus/final_tfrecord_sharded"
VOLS+=" -v $results_dir:/results"
VOLS+=" -v $checkpoints_dir:/checkpoints"

export VOLS
export CONT
export DGXSYSTEM
export DGXNGPU
export DGXIBDEVICES
export EXTRA_PARAMS

# --- Submission --------------------------------------------------------------
set -x
cd "$CODEDIR" || { echo "Cannot cd to CODEDIR '$CODEDIR'" >&2; exit 1; }
pwd

# Only pass -p when a non-default partition was requested. An array keeps the
# flag and its value as two intact words without relying on word splitting.
partition_args=()
if [ "$partition" != "default" ]; then
  partition_args=(-p "$partition")
fi

# Global batch size = nodes * per-process batch * GPUs per node.
# The original read an undefined $batch_size, which made expr fail and left
# GBS empty. NOTE(review): train_batch_size is assumed to be the intended
# operand - confirm.
GBS=$(( num_nodes * train_batch_size * DGXNGPU ))
export GBS

printf -v TAG "%s_%dn_%s_gbs%d" "$job_name" "$num_nodes" "$precision" "$GBS"

DATESTAMP=$(date +'%y%m%d%H%M%S')
export DATESTAMP

sbatch "${partition_args[@]}" \
  -N "$num_nodes" \
  -t "$wall_time" \
  -J "$job_name" \
  --exclusive \
  --mem=0 \
  --mail-type=FAIL \
  --ntasks-per-node="$DGXNGPU" \
  --threads-per-core="$DGXHT" \
  --cores-per-socket="$DGXSOCKETCORES" \
  --output="$LOGDIR/$TAG.$DATESTAMP.log" \
  "$CODEDIR/scripts/run.sub"
set +x