#!/usr/bin/env python

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse
from pathlib import Path

# Command-line interface of the launcher. ArgumentDefaultsHelpFormatter makes
# `--help` print every option's default next to its description.
optparser = argparse.ArgumentParser(
    description='Train classification models on ImageNet',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
optparser.add_argument(
    '-n', '--ngpus',
    type=int,
    default=1,
    help='number of GPUs to use',
)
optparser.add_argument(
    '-b', '--batch-size',
    type=int,
    default=192,
    help='batch size per GPU',
)
optparser.add_argument(
    '-e', '--num-epochs',
    type=int,
    default=90,
    help='number of epochs',
)
optparser.add_argument(
    '-l', '--lr',
    type=float,
    default=0.256,
    help=('learning rate; '
          'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`'),
)
optparser.add_argument(
    '--data-root',
    type=Path,
    default=Path('/data/imagenet/train-val-recordio-passthrough'),
    help='Directory with RecordIO data files',
)
optparser.add_argument(
    '--dtype',
    default='float16',
    choices=('float32', 'float16'),
    help='Precision',
)
optparser.add_argument(
    '--kv-store',
    default='horovod',
    choices=('device', 'horovod'),
    help='key-value store type',
)
optparser.add_argument(
    '--data-backend',
    default='dali-gpu',
    choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'),
    help='data backend',
)

# parse_known_args() does not reject unknown options: everything this parser
# does not recognise is collected in `args` instead of raising an error.
opts, args = optparser.parse_known_args()

# Number of input channels, as a string, used for the image shape.
# float16 runs use 4 channels unless the 'mxnet' data backend is selected;
# every other combination uses 3.
# NOTE(review): the 4th channel is presumably padding wanted by the fp16
# pipeline -- confirm against train.py / the data loaders.
uses_four_channels = opts.dtype == 'float16' and opts.data_backend != 'mxnet'
n_ch = '4' if uses_four_channels else '3'

# --batch-size is given per GPU; turn it into the batch size across all GPUs.
opts.batch_size = opts.batch_size * opts.ngpus
# Scale the learning rate by (total batch size / 256), as announced in the
# --lr help text.
opts.lr = opts.lr * (opts.batch_size / 256)

# Assemble the argv list that is executed at the end of this script.

# When Horovod is the kv-store, start one process per GPU through horovodrun;
# otherwise the training script is run directly.
if 'horovod' in opts.kv_store:
    launcher_prefix = ['horovodrun', '-np', str(opts.ngpus)]
else:
    launcher_prefix = []

# train.py is looked up in the same directory as this script.
train_script = Path(__file__).parent / "train.py"

command = launcher_prefix + [
    'python', str(train_script),
    '--data-train', str(opts.data_root / "train.rec"),
    '--data-train-idx', str(opts.data_root / "train.idx"),
    '--data-val', str(opts.data_root / "val.rec"),
    '--data-val-idx', str(opts.data_root / "val.idx"),
    '--dtype', opts.dtype,
    '--image-shape', n_ch + ',224,224',
]

# Extra switches passed only for half-precision runs: fused BN kernels and
# explicit tensor layouts.
if opts.dtype == 'float16':
    command.extend([
        '--fuse-bn-relu', '1',
        '--fuse-bn-add-relu', '1',
        '--input-layout', 'NCHW',
        '--conv-layout', 'NHWC',
        '--batchnorm-layout', 'NHWC',
        '--pooling-layout', 'NHWC',
    ])

# Comma-separated GPU indices 0 .. ngpus-1.
gpu_ids = ','.join(str(gpu) for gpu in range(opts.ngpus))

command.extend([
    '--kv-store', opts.kv_store,
    '--data-backend', opts.data_backend,
    '--lr', str(opts.lr),
    '--gpus', gpu_ids,
    '--batch-size', str(opts.batch_size),
    '--num-epochs', str(opts.num_epochs),
])

# Options this launcher did not recognise are passed through unchanged, last.
command.extend(args)


# Environment variables for MXNet and Horovod. They are written into this
# process's environment unconditionally (overriding any values already set)
# so that the command executed below inherits them.
os.environ.update({
    'MXNET_UPDATE_ON_KVSTORE': "0",
    'MXNET_EXEC_ENABLE_ADDTO': "1",
    'MXNET_USE_TENSORRT': "0",
    'MXNET_GPU_WORKER_NTHREADS': "2",
    'MXNET_GPU_COPY_NTHREADS': "1",
    'MXNET_OPTIMIZER_AGGREGATION_SIZE': "54",
    'HOROVOD_CYCLE_TIME': "0.1",
    'HOROVOD_FUSION_THRESHOLD': "67108864",
    'HOROVOD_NUM_NCCL_STREAMS': "2",
    'MXNET_HOROVOD_NUM_GROUPS': "16",
    'MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD': "999",
    'MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD': "25",
})

# Replace the current process with the assembled command; the executable
# (command[0]) is resolved through PATH. Nothing after this line runs.
os.execvp(command[0], command)