DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh

44 lines
1.4 KiB
Bash

#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Launches train.py and/or eval.py through `python -m torch.distributed.launch`
# using config file wt103_large.yaml and config 8dgx2_16gpu_fp16.
#
# Usage: <this script> {train|eval|all} [extra args forwarded to the script]
#
# Environment variables read:
#   WORLD_SIZE    passed as --nnodes
#   SLURM_NODEID  passed as --node_rank
#   MASTER_ADDR   passed as --master_addr
#   MASTER_PORT   passed as --master_port
#   DGXNGPU       passed as --nproc_per_node
#
# Exit status: non-zero for an unknown mode, a missing environment variable,
# or a failed python run. In 'all' mode evaluation is skipped when training
# fails.

mode=${1-}

echo "MASTER_ADDR ${MASTER_ADDR}"
echo "MASTER_PORT ${MASTER_PORT}"
echo "WORLD_SIZE ${WORLD_SIZE}"
echo "SLURM_NODEID ${SLURM_NODEID}"

# Reject an unsupported mode before anything is launched.
case "${mode}" in
  train | eval | all) ;;
  *)
    echo "unknown argument '${mode}' (expected: train, eval or all)" >&2
    exit 1
    ;;
esac

# Every launcher flag below needs a value; an empty one would otherwise only
# show up as an argument-parsing error from the launcher.
for required_var in WORLD_SIZE SLURM_NODEID MASTER_ADDR MASTER_PORT DGXNGPU; do
  if [[ -z "${!required_var-}" ]]; then
    echo "required environment variable ${required_var} is not set" >&2
    exit 1
  fi
done

# Launcher options live in an array so each value stays a single argument.
launcher_args=(
  -m torch.distributed.launch
  --nnodes "${WORLD_SIZE}"
  --node_rank "${SLURM_NODEID}"
  --master_addr "${MASTER_ADDR}"
  --master_port "${MASTER_PORT}"
  "--nproc_per_node=${DGXNGPU}"
)

#######################################
# Runs one python script under the distributed launcher.
# Globals:   launcher_args (read)
# Arguments: $1 - script to run (train.py or eval.py)
#            remaining - extra arguments forwarded unchanged to the script
# Returns:   exit status of the python process
#######################################
run_stage() {
  local script=$1
  shift
  python "${launcher_args[@]}" "${script}" \
    --config_file wt103_large.yaml \
    --config 8dgx2_16gpu_fp16 \
    "$@"
}

if [[ "${mode}" == 'train' || "${mode}" == 'all' ]]; then
  echo 'Run training...'
  # "${@:2}" is quoted so forwarded arguments keep their original word
  # boundaries; a failure is propagated instead of being masked.
  run_stage train.py "${@:2}" || exit "$?"
fi

if [[ "${mode}" == 'eval' || "${mode}" == 'all' ]]; then
  echo 'Run evaluation...'
  run_stage eval.py "${@:2}" || exit "$?"
fi