81 lines
3.6 KiB
Python
Executable file
81 lines
3.6 KiB
Python
Executable file
#!/usr/bin/env python
|
|
|
|
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import os
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
optparser = argparse.ArgumentParser(description='Train classification models on ImageNet',
|
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
|
optparser.add_argument('-n', '--ngpus', type=int, default=1, help='number of GPUs to use')
|
|
optparser.add_argument('-b', '--batch-size', type=int, default=192, help='batch size per GPU')
|
|
optparser.add_argument('-e', '--num-epochs', type=int, default=90, help='number of epochs')
|
|
optparser.add_argument('-l', '--lr', type=float, default=0.256, help='learning rate; '
|
|
'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`')
|
|
optparser.add_argument('--data-root', type=Path, help='Directory with RecordIO data files', default=Path('/data/imagenet/train-val-recordio-passthrough'))
|
|
optparser.add_argument('--dtype', help='Precision', default='float16', choices=('float32', 'float16'))
|
|
optparser.add_argument('--kv-store', default='horovod', choices=('device', 'horovod'), help='key-value store type')
|
|
optparser.add_argument('--data-backend', default='dali-gpu', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'), help='data backend')
|
|
|
|
opts, args = optparser.parse_known_args()
|
|
|
|
if opts.dtype == 'float16':
|
|
n_ch = str(4 - int(opts.data_backend == 'mxnet'))
|
|
else:
|
|
n_ch = str(3)
|
|
|
|
opts.batch_size *= opts.ngpus
|
|
opts.lr *= opts.batch_size / 256
|
|
|
|
command = []
|
|
if 'horovod' in opts.kv_store:
|
|
command += ['horovodrun', '-np', str(opts.ngpus)]
|
|
command += ['python', str(Path(__file__).parent / "train.py")]
|
|
command += ['--data-train', str(opts.data_root / "train.rec")]
|
|
command += ['--data-train-idx', str(opts.data_root / "train.idx")]
|
|
command += ['--data-val', str(opts.data_root / "val.rec")]
|
|
command += ['--data-val-idx', str(opts.data_root / "val.idx")]
|
|
command += ['--dtype', opts.dtype]
|
|
command += ['--image-shape', n_ch + ',224,224']
|
|
if opts.dtype == 'float16':
|
|
command += '--fuse-bn-relu 1 --fuse-bn-add-relu 1'.split()
|
|
command += '--input-layout NCHW --conv-layout NHWC ' \
|
|
'--batchnorm-layout NHWC --pooling-layout NHWC'.split()
|
|
|
|
command += ['--kv-store', opts.kv_store]
|
|
command += ['--data-backend', opts.data_backend]
|
|
command += ['--lr', str(opts.lr)]
|
|
command += ['--gpus', ','.join(list(map(str, range(opts.ngpus))))]
|
|
command += ['--batch-size', str(opts.batch_size)]
|
|
command += ['--num-epochs', str(opts.num_epochs)]
|
|
|
|
command += args
|
|
|
|
|
|
os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
|
|
os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
|
|
os.environ['MXNET_USE_TENSORRT'] = "0"
|
|
os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
|
|
os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
|
|
os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"
|
|
os.environ['HOROVOD_CYCLE_TIME'] = "0.1"
|
|
os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864"
|
|
os.environ['HOROVOD_NUM_NCCL_STREAMS'] = "2"
|
|
os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16"
|
|
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999"
|
|
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25"
|
|
|
|
os.execvp(command[0], command)
|