DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL/pytorch/run.sub

#!/bin/bash
#SBATCH -N 8 # number of nodes
#SBATCH -t 4:00:00 # wall time
#SBATCH -J "transformer-xl_pyt" # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --overcommit # Needed for pytorch
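# Usage sketch (assuming a Slurm cluster with the Pyxis/Enroot container plugin,
# which provides the --container-image/--container-mounts flags used by the
# srun calls below). The first positional argument selects the phase:
#   sbatch run.sub train   # training only
#   sbatch run.sub eval    # final evaluation only
#   sbatch run.sub all     # training followed by final evaluation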
# Configure the multinode script
SCRIPT="run_multinode_wt103_large.sh"
# Configure workspace in the file system
WORK_DIR="/gpfs/fs1/${USER}/transformer-xl_pyt"
CONT_WORK_DIR="/workspace/transformer-xl"
# Configure container and mounting paths
CONT="<YOUR DOCKER REGISTRY>/transformer-xl:latest"
MOUNTS="${WORK_DIR}/data:${CONT_WORK_DIR}/data,${WORK_DIR}/results:${CONT_WORK_DIR}/results"
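# With these mounts, ${WORK_DIR}/data and ${WORK_DIR}/results on the shared
# file system appear inside the container as ${CONT_WORK_DIR}/data and
# ${CONT_WORK_DIR}/results; the pre-processed WikiText-103 data is assumed to
# already be staged under ${WORK_DIR}/data.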
# Create directory for the results
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${WORK_DIR}/results"
# Configure DGX-2H parameters
export DGXNSOCKET=2
export DGXSOCKETCORES=24
export DGXNGPU=16
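# The values above describe a DGX-2H node (2 CPU sockets, 24 cores per socket,
# 16 GPUs); they are presumably consumed by ${SCRIPT} to size per-node process
# placement and CPU binding.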
# Overwrite default parameters in the script
EXTRA_TRAIN_PARAMS=""
EXTRA_EVAL_PARAMS=""
if [ -n "$MAX_STEP" ]; then
EXTRA_TRAIN_PARAMS+="--max_step ${MAX_STEP} "
fi
if [ -n "$SEED" ]; then
EXTRA_TRAIN_PARAMS+="--seed ${SEED} "
fi
if [ -n "$BATCH_SIZE" ]; then
EXTRA_TRAIN_PARAMS+="--batch_size ${BATCH_SIZE} "
fi
if [ -n "$LOCAL_BATCH_SIZE" ]; then
EXTRA_TRAIN_PARAMS+="--local_batch_size ${LOCAL_BATCH_SIZE} "
fi
if [ -n "$BATCH_CHUNK" ]; then
EXTRA_TRAIN_PARAMS+="--batch_chunk ${BATCH_CHUNK} "
fi
if [ -n "$EVAL_INTERVAL" ]; then
EXTRA_TRAIN_PARAMS+="--eval_interval ${EVAL_INTERVAL} "
fi
if [ -n "$EVAL_BATCH_SIZE" ]; then
EXTRA_TRAIN_PARAMS+="--eval_batch_size ${EVAL_BATCH_SIZE} "
EXTRA_EVAL_PARAMS+="--batch_size ${EVAL_BATCH_SIZE} "
fi
if [ -n "$LR" ]; then
EXTRA_TRAIN_PARAMS+="--lr ${LR} "
fi
if [ -n "$FP16" ]; then
EXTRA_TRAIN_PARAMS+="--fp16 "
EXTRA_EVAL_PARAMS+="--fp16 "
fi
if [ -n "$CONFIG" ]; then
EXTRA_TRAIN_PARAMS+="--config ${CONFIG} "
EXTRA_EVAL_PARAMS+="--config ${CONFIG} "
fi
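# Example override (hypothetical values): a short mixed-precision run with a
# fixed seed, configured through the environment instead of editing this file:
#   MAX_STEP=500 SEED=1111 FP16=1 sbatch run.sub train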
if [[ $1 == 'train' ]] || [[ $1 == 'all' ]]; then
    # Run training
    srun --mpi=none \
        --container-image="${CONT}" \
        --container-mounts="${MOUNTS}" \
        bash "${SCRIPT}" train \
            --work_dir "${CONT_WORK_DIR}/results/${SLURM_JOB_ID}" \
            ${EXTRA_TRAIN_PARAMS}  # intentionally unquoted so each flag is passed as a separate argument
fi
if [[ $1 == 'eval' ]] || [[ $1 == 'all' ]]; then
    # Run final evaluation
    srun --mpi=none \
        --container-image="${CONT}" \
        --container-mounts="${MOUNTS}" \
        bash "${SCRIPT}" eval \
            --work_dir "${CONT_WORK_DIR}/results/${SLURM_JOB_ID}" \
            ${EXTRA_EVAL_PARAMS}  # intentionally unquoted so each flag is passed as a separate argument
fi
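# With the mounts above, --work_dir resolves on the host to
# ${WORK_DIR}/results/${SLURM_JOB_ID}, so each job writes its outputs to its
# own subdirectory of the shared results directory.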
if [[ $1 != 'train' ]] && [[ $1 != 'eval' ]] && [[ $1 != 'all' ]]; then
    echo "unknown argument '$1' (expected 'train', 'eval', or 'all')" >&2
    exit 1
fi