Merge pull request #4 from NVIDIA/master

pull upstream
This commit is contained in:
Yang Zhang 2019-10-22 09:48:09 -07:00 committed by GitHub
commit 52c60d2ed2
331 changed files with 19749 additions and 4977 deletions


@ -19,5 +19,6 @@ set(cuda_kernel_files
)
add_library(fastertransformer STATIC ${cuda_kernel_files})
set_target_properties(fastertransformer PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(fastertransformer PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})


@ -197,11 +197,9 @@ void add_bias_input_layernorm(__half* out, const __half* input, const __half* bi
template <typename T>
void add_bias_act_kernelLauncher(T* out, const T* bias, int m, int n, cudaStream_t stream)
{
// dim3 grid(m / 64);
dim3 grid(m / 4);
dim3 block(n / 4);
assert(block.x > 1024);
// dim3 block(n);
assert(block.x <= 1024);
add_bias_act<T><<<grid, block, 0, stream>>>(out, bias, m, n);
}
@ -209,9 +207,9 @@ template<typename T>
void add_bias_input_layernorm_kernelLauncher(T* out, const T* input, const T* bias,
const T* gamma, const T* beta, int m, int n, cudaStream_t stream)
{
assert(n > 1024);
dim3 grid(m);
dim3 block(n);
assert(block.x <= 1024);
add_bias_input_layernorm<T><<<grid, block, 0, stream>>>(out, input, bias, gamma, beta, m, n);
}
@ -220,9 +218,9 @@ template <>
void add_bias_input_layernorm_kernelLauncher(__half* out, const __half* input, const __half* bias,
const __half* gamma, const __half* beta, int m, int n, cudaStream_t stream)
{
assert(n / 2 > 1024);
dim3 grid(m);
dim3 block(n / 2);
assert(block.x <= 1024);
add_bias_input_layernorm<__half><<<grid, block, 0, stream>>>(out, input, bias, gamma, beta, m, n);
}


@ -88,7 +88,7 @@ T blockReduceMax(T val)
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : 0;
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : -1e20f;
val = warpReduceMax(val);
return val;
@ -204,7 +204,7 @@ void softmax_kernel(T* qk_buf_, const T* attr_mask, const int batch_size, const
mask_val = (1.0f - mask_val) * -10000.0f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val): -1e-20f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val): -1e20f;
float max_val = blockReduceMax<float>(tmp);
@ -248,7 +248,7 @@ void softmax_kernel_v2(T* qk_buf_, const T* attr_mask, const int batch_size, con
mask_val = (1.0f - mask_val) * -10000.0f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val) : -1e-20f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val) : -1e20f;
float max_val = blockReduceMax<float>(tmp);
if(threadIdx.x == 0)
s_max = max_val;
@ -324,10 +324,9 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
if(OpType_ == OperationType::FP32)
{
// const int word_per_block = 32;
const int word_per_block = 1;
assert(k > 1024);
assert(m / word_per_block * 3 > 65536);
assert(k <= 1024);
assert(m / word_per_block * 3 <= 65536);
dim3 grid(m / word_per_block * 3);
dim3 block(k);
@ -340,8 +339,6 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
grid.x = batch_size * seq_len / word_per_block;
block.x = head_num * size_per_head * word_per_block / 2;
assert(block.x);
add_QKV_bias<DataType_><<<grid, block, 0, stream>>>(Q, bias_Q, K, bias_K, V, bias_V, q_buf_, k_buf_,
v_buf_, batch_size, seq_len, head_num, size_per_head / 2, word_per_block);
}
@ -400,11 +397,10 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
if(OpType_ == OperationType::HALF)
{
const int seq_per_block = 4;
// const int seq_per_block = 1;
grid.x = batch_size * head_num * seq_len / seq_per_block;
block.x = seq_per_block * size_per_head / 2;
assert(grid.x * seq_per_block != batch_size * head_num * seq_len);
assert(grid.x * seq_per_block == batch_size * head_num * seq_len);
transpose<DataType_><<<grid, block, 0, stream>>>(transpose_dst_, dst,
batch_size, seq_len, head_num, size_per_head / 2);


@ -25,4 +25,5 @@ add_definitions(-DGOOGLE_CUDA=1)
add_definitions(-DNDEBUG)
add_library(tf_fastertransformer SHARED ${tf_bert_transformer_files})
set_target_properties(tf_fastertransformer PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tf_fastertransformer PRIVATE -lcublas -lcudart -ltensorflow_framework ${CMAKE_THREAD_LIBS_INIT})


@ -363,7 +363,7 @@ with tf.Session(config=config) as sess:
print("#################################")
np_val1 = sess.run(output)
np_val2 = sess.run(output_own)
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-5)))
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-1)))
print("max diff " + str(np.fabs(np_val1 - np_val2).max()))
print("min diff " + str(np.fabs(np_val1 - np_val2).min()))
print np_val1


@ -361,7 +361,7 @@ with tf.Session(config=config) as sess:
print("#################################")
np_val1 = sess.run(output)
np_val2 = sess.run(output_own)
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-5)))
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-4)))
print("max diff " + str(np.fabs(np_val1 - np_val2).max()))
print("min diff " + str(np.fabs(np_val1 - np_val2).min()))


@ -22,7 +22,9 @@ set(gemm_fp32_files
)
add_executable(gemm_fp32 ${gemm_fp32_files})
set_target_properties(gemm_fp32 PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm_fp32 PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})
add_executable(gemm_fp16 ${gemm_fp16_files})
set_target_properties(gemm_fp16 PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm_fp16 PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})


@ -0,0 +1,3 @@
FROM nvcr.io/nvidia/mxnet:19.07-py3
COPY . /workspace/rn50
WORKDIR /workspace/rn50


@ -1,3 +1,4 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/


@ -1,6 +1,46 @@
# ResNet50 v1.5 For MXNet
# ResNet50 v1.5 for MXNet
## The model
This repository provides a script and recipe to train the ResNet50 v1.5 model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (8x V100 16G)](#inference-performance-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is in the bottleneck blocks which require
@ -9,96 +49,448 @@ v1.5 has stride = 2 in the 3x3 convolution
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
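To make the difference concrete, here is a minimal, hypothetical Gluon-style sketch of a v1.5 bottleneck block (the stride-2 downsampling sits in the 3x3 convolution; v1 would place it in the first 1x1). The repository's actual block definitions live in `models.py`.
```python
from mxnet.gluon import nn

class BottleneckV15(nn.HybridBlock):
    """Illustrative ResNet50 v1.5 bottleneck: the stride lives in the 3x3 conv."""
    def __init__(self, channels, stride, downsample=False, **kwargs):
        super(BottleneckV15, self).__init__(**kwargs)
        self.body = nn.HybridSequential()
        self.body.add(
            nn.Conv2D(channels // 4, kernel_size=1, strides=1, use_bias=False),
            nn.BatchNorm(), nn.Activation('relu'),
            # v1 puts strides=stride on the 1x1 above; v1.5 puts it here instead
            nn.Conv2D(channels // 4, kernel_size=3, strides=stride, padding=1, use_bias=False),
            nn.BatchNorm(), nn.Activation('relu'),
            nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False),
            nn.BatchNorm())
        self.downsample = None
        if downsample:
            self.downsample = nn.HybridSequential()
            self.downsample.add(
                nn.Conv2D(channels, kernel_size=1, strides=stride, use_bias=False),
                nn.BatchNorm())

    def hybrid_forward(self, F, x):
        residual = x if self.downsample is None else self.downsample(x)
        return F.Activation(self.body(x) + residual, act_type='relu')
```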
## Training procedure
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 3.5x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Optimizer
### Default configuration
This model trains for 90 epochs, with the standard ResNet v1.5 setup:
**Optimizer:**
* SGD with momentum (0.9)
* SGD with momentum (0.875)
* Learning rate = 0.256 for 256 batch size, for other batch sizes we linearly scale the learning rate.
* Learning rate schedule -- we use a cosine LR schedule
* Linear warmup of the learning rate during first 5 epochs according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 3.0517578125e-05 (1/32768).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label Smoothing: 0.1
* We train for:
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
* 90 Epochs -> 90 epochs is a standard for ResNet50
* 250 Epochs -> best possible accuracy. For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
* Learning rate = 0.1 for 256 batch size, for other batch sizes we linearly
scale the learning rate.
**Data augmentation:**
* Learning rate decay - multiply by 0.1 after 30, 60, and 80 epochs
This model uses the following data augmentation:
* Linear warmup of the learning rate during first 5 epochs
according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 1e-4
### Data Augmentation
During training, we perform the following augmentation techniques:
For training:
* Normalization
* Random resized crop to 224x224
* Scale from 5% to 100%
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
During inference, we perform the following augmentation techniques:
For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
See `data.py` for more info.
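As a rough sketch (illustrative names only, the real schedule lives in the training scripts), the learning-rate policy described above can be written as:
```python
import math

def learning_rate(epoch, batch_size, base_lr=0.256, base_batch=256,
                  warmup_epochs=5, total_epochs=90):
    """Linear scaling to the batch size, linear warmup, then cosine decay."""
    peak_lr = base_lr * batch_size / base_batch            # linear scaling rule
    if epoch < warmup_epochs:                              # warmup during first 5 epochs
        return peak_lr * (epoch + 1) / warmup_epochs
    progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))   # cosine schedule

# e.g. learning_rate(0, 192) ~= 0.038 and learning_rate(45, 192) ~= 0.10
```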
### Feature support matrix
# Setup
## Requirements
Ensure your environment meets the following requirements:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [MXNet 18.12-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet) or newer
* [NVIDIA-DALI 0.5.0](https://github.com/NVIDIA/DALI) -- included in the MXNet container
* [Python 3.5](https://www.python.org) -- included in the MXNet container
* [CUDA 10](https://developer.nvidia.com/cuda-toolkit) -- included in the MXNet container
* [cuDNN 7.4.1](https://developer.nvidia.com/cudnn) -- included in the MXNet container
* (optional) NVIDIA Volta or Turing GPU (see section below) -- for best training performance using FP16
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
## Training using mixed precision with Tensor Cores
### Hardware requirements
Training with mixed precision on NVIDIA Tensor Cores requires an NVIDIA Volta-based or Turing-based GPU.
| **Feature** | **ResNet50 MXNet** |
|:---:|:--------:|
|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html)|yes|
|Horovod Multi-GPU|yes|
### Software changes
#### Features
The following features are supported by this model.
For information about how to train using mixed precision, see the
[Mixed Precision Training paper](https://arxiv.org/abs/1710.03740)
and
[Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
NVIDIA DALI - NVIDIA Data Loading Library (DALI) is a collection of highly optimized building blocks, and an execution engine, to accelerate the pre-processing of the input data for deep learning applications. DALI provides both the performance and the flexibility for accelerating different data pipelines as a single library. This single library can then be easily integrated into different deep learning training and inference applications.
Horovod Multi-GPU - Horovod is a distributed training framework for TensorFlow, Keras, PyTorch and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
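As a hedged sketch (not this repository's exact wiring; see `fit.py` for the real integration), Horovod typically plugs into MXNet like this:
```python
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()                               # one training process per GPU
ctx = mx.gpu(hvd.local_rank())           # pin this process to its local GPU

# FP32 master weights via multi_precision; LR scaled to the global batch size
opt = mx.optimizer.create('sgd', momentum=0.875, multi_precision=True,
                          learning_rate=0.256 * (hvd.size() * 192) / 256)

# With a Gluon network `net`, gradients are averaged across workers by:
# trainer = hvd.DistributedTrainer(net.collect_params(), opt)
# hvd.broadcast_parameters(net.collect_params(), root_rank=0)
```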
# Quick start guide
## Docker
### Mixed precision training
To run the MXNet Docker container, run:
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path to prepared dataset>:/data/imagenet/train-val-recordio-passthrough nvcr.io/nvidia/mxnet:18.12-py3`
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
It will also automatically start downloading the MXNet container if you haven't downloaded it yet. You can also download it manually by running:
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
`nvidia-docker pull nvcr.io/nvidia/mxnet:18.12-py3`
If you haven't prepared the dataset yet, download the raw ImageNet dataset (see the section below) and run:
`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path where prepared dataset should be created>:/data/imagenet/train-val-recordio-passthrough -v <path to raw dataset>:/data/imagenet/raw nvcr.io/nvidia/mxnet:18.12-py3`
#### Enabling mixed precision
Using the Gluon API, perform the following steps to convert a model so that it supports computation with float16.
and follow the steps from the Prepare Dataset section.
1. Cast Gluon Blocks parameters and expected input type to float16 by calling the cast method of the Block representing the network.
```python
net = net.cast('float16')
```
## Prepare Dataset
2. Ensure the data input to the network is of float16 type. If your DataLoader or Iterator produces output in another datatype, then you have to cast your data. There are different ways you can do this. The easiest way is to use the `astype` method of NDArrays.
```python
data = data.astype('float16', copy=False)
```
3. If you are using images and DataLoader, you can also use a Cast transform. It is preferable to use the multi_precision mode of the optimizer when training in float16. This mode of optimizer maintains a master copy of the weights in float32 even when the training (forward and backward pass) is in float16. This helps increase the precision of the weight updates and can lead to faster convergence in some scenarios.
```python
optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01)
```
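Putting the three steps together, a minimal, hypothetical Gluon training step in float16 could look like the following (the repository itself drives training through `fit.py`/`train.py` rather than this exact code):
```python
import mxnet as mx
from mxnet import autograd, gluon

ctx = mx.gpu(0)
net = gluon.model_zoo.vision.resnet50_v2(classes=1000)
net.initialize(mx.init.Xavier(), ctx=ctx)
net.cast('float16')                                     # step 1: cast the network

optimizer = mx.optimizer.create('sgd', learning_rate=0.256, momentum=0.875,
                                multi_precision=True)   # step 3: FP32 master weights
trainer = gluon.Trainer(net.collect_params(), optimizer)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

# Dummy batch for illustration; real data comes from the RecordIO/DALI pipeline
data = mx.nd.random.uniform(shape=(192, 3, 224, 224), ctx=ctx).astype('float16', copy=False)  # step 2
label = mx.nd.zeros((192,), ctx=ctx)

with autograd.record():
    loss = loss_fn(net(data), label)
loss.backward()
trainer.step(batch_size=192)
```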
## Setup
The following section lists the requirements in order to start training the ResNet50 v1.5 model.
### Requirements
This repository contains a Dockerfile which extends the MXNet NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [MXNet 19.07-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet)
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
For those unable to use the MXNet NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
**1. Clone the repository.**
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/MxNet/Classification/RN50v1.5
```
**2. Build the ResNet50 MXNet NGC container.**
After Docker is set up, you can build the ResNet50 image with:
```bash
docker build . -t nvidia_rn50_mx
```
**3. Start an interactive session in the NGC container to run preprocessing/training/inference.**
```bash
nvidia-docker run --rm -it --ipc=host -v <path to dataset>:/data/imagenet/train-val-recordio-passthrough nvidia_rn50_mx
```
**4. Download and preprocess the data.**
* Download the images from http://image-net.org/download-images.
* Extract the training and validation data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
**5. Extract the validation data and move the images to subfolders.**
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
**6. Preprocess the dataset.**
```bash
./scripts/prepare_imagenet.sh <path to raw imagenet> <path where processed dataset will be created>
```
**7. Start training.**
```bash
./runner -n <number of gpus> -b <batch size per GPU (default 192)>
```
**8. Start validation/evaluation.**
```bash
./runner -n <number of gpus> -b <batch size per GPU (default 192)> --load <path to trained model> --mode val
```
**9. Start inference/predictions.**
```bash
./runner --load <path to trained model> --mode pred --data-pred <path to the image>
```
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the root directory, the most important files are:
* `runner`: A wrapper on the `train.py` script which is the main executable script for training/validation/predicting
* `benchmark.py`: A script for benchmarking
* `Dockerfile`: Recipe used to build the container
* `fit.py`: A file containing most of the training and validation logic
* `data.py`: Data loading and preprocessing code
* `dali.py`: Data loading and preprocessing code using DALI
* `models.py`: The model architecture
* `report.py`: A file containing JSON report structure and description of fields
In the `scripts` directory, the most important files are:
* `prepare_imagenet.sh`: A script that converts raw dataset format to RecordIO format
### Parameters
The complete list of available parameters contains:
```
Model:
--arch {resnetv1,resnetv15,resnextv1,resnextv15,xception}
model architecture (default: resnetv15)
--num-layers NUM_LAYERS
number of layers in the neural network, required by
some networks such as resnet (default: 50)
--num-groups NUM_GROUPS
number of groups for grouped convolutions, required by
some networks such as resnext (default: 32)
--num-classes NUM_CLASSES
the number of classes (default: 1000)
--batchnorm-eps BATCHNORM_EPS
the amount added to the batchnorm variance to prevent
output explosion. (default: 1e-05)
--batchnorm-mom BATCHNORM_MOM
the leaky-integrator factor controlling the batchnorm
mean and variance. (default: 0.9)
--fuse-bn-relu FUSE_BN_RELU
have batchnorm kernel perform activation relu
(default: 0)
--fuse-bn-add-relu FUSE_BN_ADD_RELU
have batchnorm kernel perform add followed by
activation relu (default: 0)
Training:
--mode {train_val,train,val,pred}
mode (default: train_val)
--seed SEED random seed (default: None)
-n NGPUS, --ngpus NGPUS
number of GPUs to use (default: 1)
--kv-store {device,horovod}
key-value store type (default: horovod)
--dtype {float32,float16}
Precision (default: float16)
--amp If enabled, turn on AMP (Automatic Mixed Precision)
(default: False)
-b BATCH_SIZE, --batch-size BATCH_SIZE
batch size per GPU (default: 192)
-e NUM_EPOCHS, --num-epochs NUM_EPOCHS
number of epochs (default: 90)
-l LR, --lr LR learning rate; IMPORTANT: true learning rate will be
calculated as `lr * batch_size / 256` (default: 0.256)
--lr-schedule {multistep,cosine}
learning rate schedule (default: cosine)
--lr-factor LR_FACTOR
the ratio to reduce lr on each step (default: 0.256)
--lr-steps LR_STEPS the epochs to reduce the lr, e.g. 30,60 (default: [])
--warmup-epochs WARMUP_EPOCHS
the epochs to ramp-up lr to scaled large-batch value
(default: 5)
--optimizer OPTIMIZER
the optimizer type (default: sgd)
--mom MOM momentum for sgd (default: 0.875)
--wd WD weight decay for sgd (default: 3.0517578125e-05)
--label-smoothing LABEL_SMOOTHING
label smoothing factor (default: 0.1)
--mixup MIXUP alpha parameter for mixup (if 0 then mixup is not
applied) (default: 0)
--disp-batches DISP_BATCHES
show progress for every n batches (default: 20)
--model-prefix MODEL_PREFIX
model checkpoint prefix (default: model)
--save-frequency SAVE_FREQUENCY
frequency of saving model in epochs (--model-prefix
must be specified). If -1 then save only best model.
If 0 then do not save anything. (default: -1)
--begin-epoch BEGIN_EPOCH
start the model from an epoch (default: 0)
--load LOAD checkpoint to load (default: None)
--test-io test reading speed without training (default: False)
--test-io-mode {train,val}
data to test (default: train)
--log LOG file where to save the log from the experiment
(default: log.log)
--report REPORT file where to save report (default: report.json)
--no-metrics do not calculate evaluation metrics (for benchmarking)
(default: False)
--benchmark-iters BENCHMARK_ITERS
run only benchmark-iters iterations from each epoch
(default: None)
Data:
--data-root DATA_ROOT
Directory with RecordIO data files (default:
/data/imagenet/train-val-recordio-passthrough)
--data-backend {dali,mxnet,synthetic}
data backend (default: dali)
--image-shape IMAGE_SHAPE
the image shape fed into the network (default: [3,
224, 224])
--rgb-mean RGB_MEAN a tuple of size 3 for the mean rgb (default: [123.68,
116.779, 103.939])
--rgb-std RGB_STD a tuple of size 3 for the std rgb (default: [58.393,
57.12, 57.375])
--input-layout {NCHW,NHWC}
the layout of the input data (default: NCHW)
--conv-layout {NCHW,NHWC}
the layout of the data assumed by the conv operation
(default: NCHW)
--batchnorm-layout {NCHW,NHWC}
the layout of the data assumed by the batchnorm
operation (default: NCHW)
--pooling-layout {NCHW,NHWC}
the layout of the data assumed by the pooling
operation (default: NCHW)
--num-examples NUM_EXAMPLES
the number of training examples (doesn't work with
mxnet data backend) (default: 1281167)
--data-val-resize DATA_VAL_RESIZE
base length of shorter edge for validation dataset
(default: 256)
DALI data backend:
entire group applies only to dali data backend
--dali-separ-val each process will perform independent validation on
whole val-set (default: False)
--dali-threads DALI_THREADS
number of threads per GPU for DALI (default: 3)
--dali-validation-threads DALI_VALIDATION_THREADS
number of threads per GPU for DALI for validation
(default: 10)
--dali-prefetch-queue DALI_PREFETCH_QUEUE
DALI prefetch queue depth (default: 2)
--dali-nvjpeg-memory-padding DALI_NVJPEG_MEMORY_PADDING
Memory padding value for nvJPEG (in MB) (default: 64)
MXNet data backend:
entire group applies only to mxnet data backend
--data-mxnet-threads DATA_MXNET_THREADS
number of threads for data decoding for mxnet data
backend (default: 40)
--random-crop RANDOM_CROP
if or not randomly crop the image (default: 0)
--random-mirror RANDOM_MIRROR
if or not randomly flip horizontally (default: 1)
--max-random-h MAX_RANDOM_H
max change of hue, whose range is [0, 180] (default:
0)
--max-random-s MAX_RANDOM_S
max change of saturation, whose range is [0, 255]
(default: 0)
--max-random-l MAX_RANDOM_L
max change of intensity, whose range is [0, 255]
(default: 0)
--min-random-aspect-ratio MIN_RANDOM_ASPECT_RATIO
min value of aspect ratio, whose value is either None
or a positive value. (default: 0.75)
--max-random-aspect-ratio MAX_RANDOM_ASPECT_RATIO
max value of aspect ratio. If min_random_aspect_ratio
is None, the aspect ratio range is
[1-max_random_aspect_ratio,
1+max_random_aspect_ratio], otherwise it is
[min_random_aspect_ratio, max_random_aspect_ratio].
(default: 1.33)
--max-random-rotate-angle MAX_RANDOM_ROTATE_ANGLE
max angle to rotate, whose range is [0, 360] (default:
0)
--max-random-shear-ratio MAX_RANDOM_SHEAR_RATIO
max ratio to shear, whose range is [0, 1] (default: 0)
--max-random-scale MAX_RANDOM_SCALE
max ratio to scale (default: 1)
--min-random-scale MIN_RANDOM_SCALE
min ratio to scale, should >= img_size/input_shape.
otherwise use --pad-size (default: 1)
--max-random-area MAX_RANDOM_AREA
max area to crop in random resized crop, whose range
is [0, 1] (default: 1)
--min-random-area MIN_RANDOM_AREA
min area to crop in random resized crop, whose range
is [0, 1] (default: 0.05)
--min-crop-size MIN_CROP_SIZE
Crop both width and height into a random size in
[min_crop_size, max_crop_size] (default: -1)
--max-crop-size MAX_CROP_SIZE
Crop both width and height into a random size in
[min_crop_size, max_crop_size] (default: -1)
--brightness BRIGHTNESS
brightness jittering, whose range is [0, 1] (default:
0)
--contrast CONTRAST contrast jittering, whose range is [0, 1] (default: 0)
--saturation SATURATION
saturation jittering, whose range is [0, 1] (default:
0)
--pca-noise PCA_NOISE
pca noise, whose range is [0, 1] (default: 0)
--random-resized-crop RANDOM_RESIZED_CROP
whether to use random resized crop (default: 1)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option: `./runner --help` and `python train.py --help`. `./runner` acts as a wrapper on `train.py` and all additional flags will be passed to `train.py`.
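For illustration only, the pass-through pattern looks roughly like this (hypothetical code; the real wrapper also sets up the multi-GPU launch):
```python
import argparse, subprocess, sys

parser = argparse.ArgumentParser('runner (illustrative)')
parser.add_argument('-n', '--ngpus', type=int, default=1)
parser.add_argument('-b', '--batch-size', type=int, default=192)
args, other_args = parser.parse_known_args()    # keep unknown flags instead of erroring out

# Forward everything the wrapper does not recognize verbatim to train.py
cmd = [sys.executable, 'train.py',
       '--gpus', ','.join(str(i) for i in range(args.ngpus)),
       '--batch-size', str(args.batch_size)] + other_args
subprocess.check_call(cmd)
```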
`./runner` command-line options:
```
usage: runner [-h] [-n NGPUS] [-b BATCH_SIZE] [-e NUM_EPOCHS] [-l LR]
[--data-root DATA_ROOT] [--dtype {float32,float16}]
[--kv-store {device,horovod}]
[--data-backend {dali,mxnet,synthetic}]
```
`train.py` command-line options:
```
usage: train.py [-h]
[--arch {resnetv1,resnetv15,resnextv1,resnextv15,xception}]
[--num-layers NUM_LAYERS] [--num-groups NUM_GROUPS]
[--num-classes NUM_CLASSES] [--batchnorm-eps BATCHNORM_EPS]
[--batchnorm-mom BATCHNORM_MOM] [--fuse-bn-relu FUSE_BN_RELU]
[--fuse-bn-add-relu FUSE_BN_ADD_RELU]
[--mode {train_val,train,val,pred}] [--seed SEED]
[--gpus GPUS] [--kv-store {device,horovod}]
[--dtype {float32,float16}] [--amp] [--batch-size BATCH_SIZE]
[--num-epochs NUM_EPOCHS] [--lr LR]
[--lr-schedule {multistep,cosine}] [--lr-factor LR_FACTOR]
[--lr-steps LR_STEPS] [--warmup-epochs WARMUP_EPOCHS]
[--optimizer OPTIMIZER] [--mom MOM] [--wd WD]
[--label-smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--disp-batches DISP_BATCHES] [--model-prefix MODEL_PREFIX]
[--save-frequency SAVE_FREQUENCY] [--begin-epoch BEGIN_EPOCH]
[--load LOAD] [--test-io] [--test-io-mode {train,val}]
[--log LOG] [--report REPORT] [--no-metrics]
[--benchmark-iters BENCHMARK_ITERS] [--data-train DATA_TRAIN]
[--data-train-idx DATA_TRAIN_IDX] [--data-val DATA_VAL]
[--data-val-idx DATA_VAL_IDX] [--data-pred DATA_PRED]
[--data-backend {dali,mxnet,synthetic}]
[--image-shape IMAGE_SHAPE] [--rgb-mean RGB_MEAN]
[--rgb-std RGB_STD] [--input-layout {NCHW,NHWC}]
[--conv-layout {NCHW,NHWC}] [--batchnorm-layout {NCHW,NHWC}]
[--pooling-layout {NCHW,NHWC}] [--num-examples NUM_EXAMPLES]
[--data-val-resize DATA_VAL_RESIZE] [--dali-separ-val]
[--dali-threads DALI_THREADS]
[--dali-validation-threads DALI_VALIDATION_THREADS]
[--dali-prefetch-queue DALI_PREFETCH_QUEUE]
[--dali-nvjpeg-memory-padding DALI_NVJPEG_MEMORY_PADDING]
[--data-mxnet-threads DATA_MXNET_THREADS]
[--random-crop RANDOM_CROP] [--random-mirror RANDOM_MIRROR]
[--max-random-h MAX_RANDOM_H] [--max-random-s MAX_RANDOM_S]
[--max-random-l MAX_RANDOM_L]
[--min-random-aspect-ratio MIN_RANDOM_ASPECT_RATIO]
[--max-random-aspect-ratio MAX_RANDOM_ASPECT_RATIO]
[--max-random-rotate-angle MAX_RANDOM_ROTATE_ANGLE]
[--max-random-shear-ratio MAX_RANDOM_SHEAR_RATIO]
[--max-random-scale MAX_RANDOM_SCALE]
[--min-random-scale MIN_RANDOM_SCALE]
[--max-random-area MAX_RANDOM_AREA]
[--min-random-area MIN_RANDOM_AREA]
[--min-crop-size MIN_CROP_SIZE]
[--max-crop-size MAX_CROP_SIZE] [--brightness BRIGHTNESS]
[--contrast CONTRAST] [--saturation SATURATION]
[--pca-noise PCA_NOISE]
[--random-resized-crop RANDOM_RESIZED_CROP]
```
### Getting the data
The MXNet ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
You can download the images from http://image-net.org/download-images
You can download the images from http://image-net.org/download-images.
The recommended data format is
[RecordIO](http://mxnet.io/architecture/note_data_loading.html), which
@ -106,7 +498,7 @@ concatenates multiple examples into seekable binary files for better read
efficiency. MXNet provides a tool called `im2rec.py` located in the `/opt/mxnet/tools/` directory.
The tool converts individual images into `.rec` files.
To prepare RecordIO file containing ImageNet data, we first need to create .lst files
To prepare a RecordIO file containing ImageNet data, we first need to create `.lst` files
which consist of the labels and image paths. We assume that the original images were
downloaded to `/data/imagenet/raw/train-jpeg` and `/data/imagenet/raw/val-jpeg`.
@ -115,121 +507,216 @@ python /opt/mxnet/tools/im2rec.py --list --recursive train /data/imagenet/raw/tr
python /opt/mxnet/tools/im2rec.py --list --recursive val /data/imagenet/raw/val-jpeg
```
Then we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
Next, we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
to speed up data loading) files. To obtain the best training accuracy
we do not preprocess the images when creating RecordIO file.
we do not preprocess the images when creating the RecordIO file.
```bash
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train /data/imagenet/raw/train-jpeg
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val /data/imagenet/raw/val-jpeg
```
## Running training
#### Dataset guidelines
The process of loading, normalizing and augmenting the data contained in the dataset can be found in the `data.py` and `dali.py` files.
To run training for a standard configuration (1/4/8 GPUs, FP16/FP32),
run one of the scripts in the `./examples` directory
called `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh`.
By default the training scripts run the validation and save a checkpoint after each epoch.
Checkpoints will be stored in `model-symbol.json` and `model-<number of epoch>.params` files.
The data is read from RecordIO format, which concatenates multiple examples into seekable binary files for better read efficiency.
If ImageNet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify the `--data-root` flag.
Data augmentation techniques are described in the [Default configuration](#default-configuration) section.
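For reference, a minimal sketch of reading the converted RecordIO pair with the plain MXNet backend (paths and values are illustrative; the repository's actual readers live in `data.py` and `dali.py`):
```python
import mxnet as mx

train_iter = mx.io.ImageRecordIter(
    path_imgrec='/data/imagenet/train-val-recordio-passthrough/train.rec',
    path_imgidx='/data/imagenet/train-val-recordio-passthrough/train.idx',
    data_shape=(3, 224, 224),        # images are decoded and cropped on the fly
    batch_size=192,
    shuffle=True,
    random_resized_crop=True,        # augmentation as described in Default configuration
    rand_mirror=True,
    preprocess_threads=40)

batch = next(iter(train_iter))       # batch.data[0]: (192, 3, 224, 224), batch.label[0]: (192,)
```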
To run a non-standard configuration, use:
#### Multi-dataset
`./runner -n <number of gpus> -b <batch size per gpu> --data-root <path to imagenet> --dtype <float32 or float16> --model-prefix <model prefix>`
In most cases, to train a model on a different dataset, no changes in the code are required, but the dataset has to be converted into RecordIO format.
Checkpoints will be stored in `<model prefix>-symbol.json` and `<model prefix>-<number of epoch>.params` files.
To generate a JSON report with performance and accuracy stats, use the `--report <path to report>` flag (see `report.py` for information about the JSON report file structure).
Use `./runner -h` and `python ./train.py -h` to obtain the list of available options.
## Running inference
To run inference on a checkpointed model run:
* For FP16
`./examples/SCORE_FP16.sh <model prefix> <epoch>`
* For FP32
`./examples/SCORE_FP32.sh <model prefix> <epoch>`
To convert a custom dataset, follow the steps from [Getting the data](#getting-the-data) section, and refer to the `scripts/prepare_dataset.py` script.
## Benchmark scripts
### Training process
To start training, run:
`./runner -n <number of gpus> -b <batch size per GPU> --data-root <path to imagenet> --dtype <float32 or float16>`
By default the training script runs the validation after each epoch:
* the best checkpoint will be stored in the `model_best.params` file in the working directory
* the log from training will be saved in the `log.log` file in the working directory
* the JSON report with statistics will be saved in the `report.json` file in the working directory
If ImageNet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify the `--data-root` flag.
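Checkpoints use the standard MXNet `<prefix>-symbol.json` / `<prefix>-<epoch>.params` naming, so they can be reloaded outside the runner; a minimal sketch (prefix, epoch and shapes are illustrative, and an FP32 checkpoint is assumed):
```python
import mxnet as mx

# e.g. model-symbol.json + model-0090.params saved with --model-prefix model
sym, arg_params, aux_params = mx.model.load_checkpoint('model', 90)

mod = mx.mod.Module(symbol=sym, context=mx.gpu(0), label_names=None)
mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))])
mod.set_params(arg_params, aux_params, allow_missing=True)

# For an FP16 checkpoint, the input would also need to be cast to float16
mod.forward(mx.io.DataBatch(data=[mx.nd.random.uniform(shape=(1, 3, 224, 224))]), is_train=False)
scores = mod.get_outputs()[0]        # (1, 1000) class scores
```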
### Inference process
To start validation, run:
`./runner -n <number of gpus> -b <batch size per GPU> --data-root <path to imagenet> --dtype <float32 or float16> --mode val`
By default:
* the log from validation will be saved in the `log.log` file in the working directory
* the JSON report with statistics will be saved in the `report.json` file in the working directory
## Performance
### Benchmarking
To benchmark training and inference, run:
`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per GPU separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per gpu separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
To control benchmark length per epoch, use `-i` flag (defaults to 100 iterations).
To control number of epochs, use `-e` flag.
To control number of warmup epochs (epochs which are not taken into account), use `-w` flag.
To limit length of dataset, use `--num-examples` flag.
To benchmark only inference, use `--only-inference` flag.
To control the benchmark length per epoch, use the `-i` flag (defaults to 100 iterations).
To control the number of epochs, use the `-e` flag.
To control the number of warmup epochs (epochs which are not taken into account), use the `-w` flag.
To limit the length of the dataset, use the `--num-examples` flag.
By default, the same parameters as in `./runner` will be used. Additional flags will be passed to `./runner`.
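The report written to `-o` follows the structure built in `benchmark.py` (shown later in this commit): top-level `ngpus`, `bs` and `metric_keys` lists plus a nested `metrics[ngpus][batch_size][metric]` mapping. A small sketch for reading it back, assuming that layout:
```python
import json

with open('benchmark_report_fp16.json') as f:        # file name from the -o flag
    report = json.load(f)

# Print training throughput (img/s) for every benchmarked (GPUs, batch size) pair
for n in report['ngpus']:
    for bs in report['bs']:
        metrics = report['metrics'][str(n)][str(bs)]
        if 'train.total_ips' in metrics:
            print('{} GPU(s), batch {}: {:.5g} img/s'.format(n, bs, metrics['train.total_ips']))
```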
#### Training performance benchmark
To benchmark only training, use the `--mode train` flag.
## Training accuracy results
The following results were obtained by running the `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh` scripts in the
mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with 8 V100 16G GPUs.
| **number of GPUs** | **FP16 top1** | **FP16 training time** | **FP32 top1** | **FP32 training time** |
|:------------------:|:-------------:|:----------------------:|:-------------:|:----------------------:|
| 1 | 76.424 | 22.9h | 76.462 | 82.0h |
| 4 | 76.328 | 6.2h | 76.448 | 21.1h |
| 8 | 76.490 | 3.3h | 76.668 | 11.1h |
Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
![TrainingLoss](./img/training_loss.png)
![TrainingAccuracy](./img/training_accuracy.png)
![ValidationAccuracy](./img/validation_accuracy.png)
#### Inference performance benchmark
To benchmark only inference, use the `--mode val` flag.
## Training performance results
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
90 epochs configuration
Our results were obtained by running the `./runner -n <number of gpus> -b 96 --dtype float32` script for FP32 and the `./runner -n <number of gpus> -b 192` script for mixed precision in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| **GPUs** | **Accuracy - mixed precision** | **Accuracy - FP32** | **Time to train - mixed precision** | **Time to train - FP32** | **Time to train - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|77.208|77.160|24.2|84.5|3.49|
|4|77.296|77.280|6.0|21.4|3.59|
|8|77.308|77.292|3.0|10.7|3.54|
##### Training stability test
Our results were obtained by running the following commands 8 times with different seeds.
* For 50 epochs
* `./runner -n 8 -b 96 --dtype float32 --num-epochs 50` for FP32
* `./runner -n 8 -b 192 --num-epochs 50` for mixed precision
* For 90 epochs
* `./runner -n 8 -b 96 --dtype float32` for FP32
* `./runner -n 8 -b 192` for mixed precision
* For 250 epochs
* `./runner -n 8 -b 96 --dtype float32 --num-epochs 250 --mixup 0.2` for FP32
* `./runner -n 8 -b 192 --num-epochs 250 --mixup 0.2` for mixed precision
| **# of epochs** | **mixed precision avg top1** | **FP32 avg top1** | **mixed precision standard deviation** | **FP32 standard deviation** | **mixed precision minimum top1** | **FP32 minimum top1** | **mixed precision maximum top1** | **FP32 maximum top1** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|50|76.156|76.185|0.118|0.082|76.010|76.062|76.370|76.304|
|90|77.105|77.224|0.097|0.060|76.982|77.134|77.308|77.292|
|250|78.317|78.400|0.073|0.102|78.202|78.316|78.432|78.570|
Plots for 250 epoch configuration
Here are example graphs of FP32 and mixed precision training on the 8-GPU, 250-epoch configuration:
![TrainingLoss](./img/dgx1-16g_250e_training_loss.png)
![ValidationTop1](./img/dgx1-16g_250e_validation_top1.png)
![ValidationTop5](./img/dgx1-16g_250e_validation_top5.png)
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
The following results were obtained by running the
`python benchmark.py -n 1,2,4,8 -b 192 --dtype float16 -o benchmark_report_fp16.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for mixed precision and the
`python benchmark.py -n 1,2,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The following results were obtained by running
`python benchmark.py -n 1,4,8 -b 208 --dtype float16 -o benchmark_report_fp16.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 25600` for FP16, and
`python benchmark.py -n 1,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 12800` for FP32
in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with V100 16G GPUs.
Training performance reported as Total IPS (data + compute time taken into account).
Weak scaling is calculated as the ratio of the speed for a given number of GPUs to the speed for 1 GPU.
| **number of GPUs** | **FP16 img/s** | **FP32 img/s** | **FP16 speedup** | **FP16 weak scaling** | **FP32 weak scaling** |
|:------------------:|:--------------:|:--------------:|:----------------:|:---------------------:|:---------------------:|
| 1 | 1442.6 | 400.2 | 3.60 | 1.00 | 1.00 |
| 4 | 5391.8 | 1558.6 | 3.46 | 3.74 | 3.89 |
| 8 | 10263.2 | 2957.4 | 3.47 | 7.11 | 7.39 |
| **GPUs** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 - mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|1427|385|3.71|1.00|1.00|
|2|2820|768|3.67|1.98|2.00|
|4|5560|1513|3.68|3.90|3.93|
|8|10931|3023|3.62|7.66|7.86|
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
## Inference performance results
The following results were obtained by running the
`python benchmark.py -n 1,4,8,16 -b 256 --dtype float16 -o benchmark_report_fp16.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for mixed precision and the
`python benchmark.py -n 1,4,8,16 -b 128 --dtype float32 -o benchmark_report_fp32.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Training performance reported as Total IPS (data + compute time taken into account).
Weak scaling is calculated as the ratio of the speed for a given number of GPUs to the speed for 1 GPU.
| **GPUs** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 - mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|1438|409|3.52|1.00|1.00|
|2|2868|817|3.51|1.99|2.00|
|4|5624|1617|3.48|3.91|3.96|
|8|11174|3214|3.48|7.77|7.86|
|16|20530|6356|3.23|14.28|15.54|
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (8x V100 16G)
The following results were obtained by running the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float16 -o inferbenchmark_report_fp16.json -i 500 -e 3 -w 1 --mode val` script for mixed precision and the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float32 -o inferbenchmark_report_fp32.json -i 500 -e 3 -w 1 --mode val` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The following results were obtained by running
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96,128,192,208 --dtype float16 -o inferbenchmark_report_fp16.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP16, and
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96 --dtype float32 -o inferbenchmark_report_fp32.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP32
in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 using one V100 16G GPU.
Inference performance reported as Total IPS (data + compute time taken into account).
| **batch size** | **FP16 img/s** | **FP32 img/s** |
|:--------------:|:--------------:|:--------------:|
| 1 | 314 | 252 |
| 2 | 555 | 393 |
| 4 | 1024 | 601 |
| 8 | 1642 | 824 |
| 16 | 2144 | 1028 |
| 32 | 2954 | 1138 |
| 64 | 3428 | 1236 |
| 96 | 3546 | 1282 |
| 128 | 3690 | |
| 192 | 3828 | |
| 208 | 3832 | |
Reported mixed precision speedups are relative to FP32 numbers for the corresponding configuration.
| **Batch size** | **Throughput (img/sec) - mixed precision** | **Throughput - speedup** | **Avg latency (ms) - mixed precision** | **Avg latency - speedup** | **50% latency (ms) - mixed precision** | **50% latency - speedup** | **90% latency (ms) - mixed precision** | **90% latency - speedup** | **95% latency (ms) - mixed precision** | **95% latency - speedup** | **99% latency (ms) - mixed precision** | **99% latency - speedup** | **100% latency (ms) - mixed precision** | **100% latency - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 1 | 397 | 1.65 | 2.5 | 1.65 | 2.5 | 1.67 | 2.7 | 1.59 | 2.8 | 1.56 | 3.2 | 1.51 | 15.8 | 0.84 |
| 2 | 732 | 1.81 | 2.7 | 1.81 | 2.6 | 1.88 | 3.0 | 1.67 | 3.3 | 1.52 | 4.9 | 1.10 | 18.8 | 0.83 |
| 4 | 1269 | 2.08 | 3.2 | 2.08 | 3.0 | 2.21 | 3.5 | 1.92 | 4.0 | 1.72 | 7.5 | 0.97 | 14.5 | 0.54 |
| 8 | 2012 | 2.53 | 4.0 | 2.53 | 3.9 | 2.59 | 4.2 | 2.45 | 4.4 | 2.37 | 8.3 | 1.29 | 15.3 | 0.72 |
| 16 | 2667 | 2.64 | 6.0 | 2.64 | 5.9 | 2.66 | 6.3 | 2.54 | 6.4 | 2.52 | 8.3 | 2.02 | 16.9 | 1.05 |
| 32 | 3240 | 2.86 | 9.9 | 2.86 | 9.8 | 2.87 | 10.3 | 2.79 | 10.4 | 2.76 | 11.5 | 2.53 | 28.4 | 1.12 |
| 64 | 3776 | 3.10 | 17.0 | 3.10 | 17.0 | 3.09 | 17.5 | 3.03 | 17.7 | 3.01 | 18.1 | 3.01 | 18.7 | 2.99 |
| 128 | 3734 | 3.02 | 34.3 | 3.02 | 33.8 | 3.05 | 35.5 | 2.93 | 36.3 | 2.88 | 42.4 | 2.79 | 51.7 | 2.38 |
| 192 | 3641 | 2.90 | 52.7 | 2.90 | 52.4 | 2.90 | 55.2 | 2.77 | 56.2 | 2.74 | 65.4 | 2.76 | 77.1 | 2.41 |
| 256 | 3463 | 2.73 | 73.9 | 2.73 | 72.8 | 2.75 | 77.3 | 2.61 | 79.9 | 2.54 | 100.8 | 2.39 | 104.1 | 2.35 |
# Changelog
##### Inference performance: NVIDIA T4
1. Dec 19, 2018
The following results were obtained by running the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float16 -o inferbenchmark_report_fp16.json -i 500 -e 3 -w 1 --mode val` script for mixed precision and the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float32 -o inferbenchmark_report_fp32.json -i 500 -e 3 -w 1 --mode val` script for FP32 in the mxnet-19.07-py3 NGC container on an NVIDIA T4 GPU.
Inference performance reported as Total IPS (data + compute time taken into account).
Reported mixed precision speedups are relative to FP32 numbers for the corresponding configuration.
| **Batch size** | **Throughput (img/sec) - mixed precision** | **Throughput - speedup** | **Avg latency (ms) - mixed precision** | **Avg latency - speedup** | **50% latency (ms) - mixed precision** | **50% latency - speedup** | **90% latency (ms) - mixed precision** | **90% latency - speedup** | **95% latency (ms) - mixed precision** | **95% latency - speedup** | **99% latency (ms) - mixed precision** | **99% latency - speedup** | **100% latency (ms) - mixed precision** | **100% latency - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 1 | 348 | 1.88 | 2.9 | 1.88 | 2.8 | 1.91 | 2.9 | 1.88 | 3.0 | 1.90 | 3.9 | 1.82 | 17.6 | 0.74 |
| 2 | 594 | 2.30 | 3.4 | 2.30 | 3.3 | 2.35 | 3.4 | 2.34 | 3.5 | 2.38 | 5.7 | 1.55 | 20.2 | 0.74 |
| 4 | 858 | 2.93 | 4.7 | 2.93 | 4.6 | 2.97 | 4.9 | 2.86 | 5.0 | 2.81 | 6.0 | 2.46 | 13.7 | 1.12 |
| 8 | 1047 | 3.17 | 7.6 | 3.17 | 7.6 | 3.19 | 7.9 | 3.10 | 8.2 | 3.02 | 9.1 | 2.77 | 15.0 | 1.72 |
| 16 | 1163 | 3.16 | 13.8 | 3.16 | 13.7 | 3.17 | 14.1 | 3.13 | 14.4 | 3.07 | 15.4 | 2.90 | 17.5 | 2.62 |
| 32 | 1225 | 3.22 | 26.1 | 3.22 | 26.1 | 3.22 | 27.0 | 3.15 | 27.3 | 3.12 | 28.3 | 3.05 | 30.5 | 2.89 |
| 64 | 1230 | 3.15 | 52.0 | 3.15 | 51.8 | 3.16 | 52.9 | 3.12 | 53.3 | 3.10 | 54.4 | 3.08 | 58.8 | 2.90 |
| 128 | 1260 | 3.21 | 101.6 | 3.21 | 101.3 | 3.22 | 102.7 | 3.21 | 103.2 | 3.20 | 115.0 | 2.89 | 121.8 | 2.86 |
| 192 | 1252 | 3.20 | 153.3 | 3.20 | 153.1 | 3.20 | 154.7 | 3.19 | 155.5 | 3.21 | 156.9 | 3.20 | 182.3 | 2.81 |
| 256 | 1251 | 3.22 | 204.6 | 3.22 | 204.3 | 3.23 | 206.4 | 3.21 | 207.1 | 3.21 | 209.3 | 3.18 | 241.9 | 2.76 |
## Release notes
### Changelog
1. Dec, 2018
* Initial release (based on https://github.com/apache/incubator-mxnet/tree/master/example/image-classification)
2. June, 2019
* Code refactor
* Label smoothing
* Cosine LR schedule
* MixUp regularization
* Better configurations
# Known Issues
### Known Issues
There are no known issues with this model.

MxNet/Classification/RN50v1.5/benchmark.py Normal file → Executable file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -18,14 +20,21 @@ import sys
import tempfile
import json
import os
import traceback
import numpy as np
from collections import OrderedDict
from subprocess import Popen
parser = argparse.ArgumentParser(description='Benchmark')
def int_list(x):
return list(map(int, x.split(',')))
parser = argparse.ArgumentParser(description='Benchmark',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--executable', default='./runner', help='path to runner')
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]',
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]', type=int_list,
required=True, help='numbers of gpus separated by comma')
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]',
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]', type=int_list,
required=True, help='batch sizes separated by comma')
parser.add_argument('-i', '--benchmark-iters', metavar='I',
type=int, default=100, help='iterations')
@ -33,57 +42,83 @@ parser.add_argument('-e', '--epochs', metavar='E',
type=int, default=1, help='number of epochs')
parser.add_argument('-w', '--warmup', metavar='N',
type=int, default=0, help='warmup epochs')
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('--only-inference', action='store_true', help="benchmark inference only")
parser.add_argument('--timeout', metavar='T',
type=str, default='inf', help='timeout for each run')
parser.add_argument('--mode', metavar='MODE', choices=('train_val', 'train', 'val'), default='train_val',
help="benchmark mode")
args, other_args = parser.parse_known_args()
ngpus = list(map(int, args.ngpus.split(',')))
batch_sizes = list(map(int, args.batch_sizes.split(',')))
latency_percentiles = ['avg', 50, 90, 95, 99, 100]
harmonic_mean_metrics = ['train.total_ips', 'val.total_ips']
res = OrderedDict()
res['model'] = ''
res['ngpus'] = ngpus
res['bs'] = batch_sizes
if args.only_inference:
res['metric_keys'] = ['val.total_ips']
else:
res['metric_keys'] = ['train.total_ips', 'val.total_ips']
res['ngpus'] = args.ngpus
res['bs'] = args.batch_sizes
res['metric_keys'] = []
if args.mode == 'train' or args.mode == 'train_val':
res['metric_keys'].append('train.total_ips')
for percentile in latency_percentiles:
res['metric_keys'].append('train.latency_{}'.format(percentile))
if args.mode == 'val' or args.mode == 'train_val':
res['metric_keys'].append('val.total_ips')
for percentile in latency_percentiles:
res['metric_keys'].append('val.latency_{}'.format(percentile))
res['metrics'] = OrderedDict()
for n in ngpus:
for n in args.ngpus:
res['metrics'][str(n)] = OrderedDict()
for bs in batch_sizes:
for bs in args.batch_sizes:
res['metrics'][str(n)][str(bs)] = OrderedDict()
report_file = args.output + '-{},{}'.format(n, bs)
Popen([args.executable, '-n', str(n), '-b', str(bs),
Popen(['timeout', args.timeout, args.executable, '-n', str(n), '-b', str(bs),
'--benchmark-iters', str(args.benchmark_iters),
'-e', str(args.epochs), '--report', report_file,
*([] if not args.only_inference else ['--only-inference']),
'--no-metrics'] + other_args, stdout=sys.stderr).wait()
'--mode', args.mode, '--no-metrics'] + other_args,
stdout=sys.stderr).wait()
with open(report_file, 'r') as f:
report = json.load(f)
try:
for suffix in ['', *['-{}'.format(i) for i in range(1, n)]]:
try:
with open(report_file + suffix, 'r') as f:
report = json.load(f)
break
except FileNotFoundError:
pass
else:
with open(report_file, 'r') as f:
report = json.load(f)
for metric in res['metric_keys']:
data = report['metrics'][metric][args.warmup:]
avg = len(data) / sum(map(lambda x: 1 / x, data))
res['metrics'][str(n)][str(bs)][metric] = avg
for metric in res['metric_keys']:
if len(report['metrics'][metric]) != args.epochs:
raise ValueError('Wrong number epochs in report')
data = report['metrics'][metric][args.warmup:]
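# Throughput metrics (total_ips) are aggregated with a harmonic mean (total images / total time);
# the latency percentiles fall through to a plain arithmetic mean below.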
if metric in harmonic_mean_metrics:
avg = len(data) / sum(map(lambda x: 1 / x, data))
else:
avg = np.mean(data)
res['metrics'][str(n)][str(bs)][metric] = avg
except Exception as e:
traceback.print_exc()
for metric in res['metric_keys']:
res['metrics'][str(n)][str(bs)][metric] = float('nan')
column_len = 7
column_len = 11
for m in res['metric_keys']:
print(m, file=sys.stderr)
print(' ' * column_len, end='|', file=sys.stderr)
for bs in batch_sizes:
for bs in args.batch_sizes:
print(str(bs).center(column_len), end='|', file=sys.stderr)
print(file=sys.stderr)
print('-' * (len(batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
for n in ngpus:
print('-' * (len(args.batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
for n in args.ngpus:
print(str(n).center(column_len), end='|', file=sys.stderr)
for bs in batch_sizes:
print(str(round(res['metrics'][str(n)][str(bs)][m])).center(column_len), end='|', file=sys.stderr)
for bs in args.batch_sizes:
print('{:.5g}'.format(res['metrics'][str(n)][str(bs)][m]).center(column_len), end='|', file=sys.stderr)
print(file=sys.stderr)
print(file=sys.stderr)


@ -52,11 +52,14 @@ class BenchmarkingDataIter:
def __getattr__(self, attr):
return getattr(self.data_iter, attr)
def get_avg_time_and_clear(self):
def get_avg_time(self):
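# Average time per measured batch; with fewer than two batches there is nothing to report,
# and one batch is excluded from the denominator (presumably the first, warmup batch).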
if self.num <= 1:
avg = float('nan')
else:
avg = self.overall_time / (self.num - 1)
return avg
def reset(self):
self.overall_time = 0
self.num = 0
return avg
self.data_iter.reset()


@ -18,146 +18,166 @@ from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.plugin.mxnet import DALIClassificationIterator
import horovod.mxnet as hvd
def add_dali_args(parser):
group = parser.add_argument_group('DALI', 'pipeline and augumentation')
group.add_argument('--use-dali', action='store_true',
help='use dalli pipeline and augunetation')
group.add_argument('--separ-val', action='store_true',
group = parser.add_argument_group('DALI data backend', 'entire group applies only to dali data backend')
group.add_argument('--dali-separ-val', action='store_true',
help='each process will perform independent validation on whole val-set')
group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\
"per GPU for DALI")
group.add_argument('--validation-dali-threads', type=int, default=10, help="number of threads" +\
group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\
"per GPU for DALI for validation")
group.add_argument('--dali-prefetch-queue', type=int, default=3, help="DALI prefetch queue depth")
group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=16, help="Memory padding value for nvJPEG (in MB)")
group.add_argument('--dali-prefetch-queue', type=int, default=2, help="DALI prefetch queue depth")
group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)")
group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not")
return parser
_mean_pixel = [255 * x for x in (0.485, 0.456, 0.406)]
_std_pixel = [255 * x for x in (0.229, 0.224, 0.225)]
class HybridTrainPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape,
nvjpeg_padding, prefetch_queue=3,
output_layout=types.NCHW, pad_output=True, dtype='float16'):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path = [rec_path], index_path=[idx_path],
def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3,
output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
random_shuffle=True, shard_id=shard_id, num_shards=num_shards)
self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB,
device_memory_padding = nvjpeg_padding,
host_memory_padding = nvjpeg_padding)
self.rrc = ops.RandomResizedCrop(device = "gpu", size = crop_shape)
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout = output_layout,
crop = crop_shape,
pad_output = pad_output,
image_type = types.RGB,
mean = _mean_pixel,
std = _std_pixel)
self.coin = ops.CoinFlip(probability = 0.5)
if dali_cpu:
dali_device = "cpu"
if args.dali_fuse_decoder:
self.decode = ops.HostDecoderRandomCrop(device=dali_device, output_type=types.RGB)
else:
self.decode = ops.HostDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
if args.dali_fuse_decoder:
self.decode = ops.nvJPEGDecoderRandomCrop(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding)
else:
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding)
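# With a fused decoder the random crop already happened inside the decode op, so a plain Resize to the
# target shape is enough; otherwise RandomResizedCrop performs the crop and resize on the decoded image.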
if args.dali_fuse_decoder:
self.resize = ops.Resize(device=dali_device, resize_x=crop_shape[1], resize_y=crop_shape[0])
else:
self.resize = ops.RandomResizedCrop(device=dali_device, size=crop_shape)
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
self.coin = ops.CoinFlip(probability=0.5)
def define_graph(self):
rng = self.coin()
self.jpegs, self.labels = self.input(name = "Reader")
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.rrc(images)
output = self.cmnp(images, mirror = rng)
images = self.resize(images)
output = self.cmnp(images.gpu(), mirror=rng)
return [output, self.labels]
class HybridValPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape,
nvjpeg_padding, prefetch_queue=3,
resize_shp=None,
output_layout=types.NCHW, pad_output=True, dtype='float16'):
super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path = [rec_path], index_path=[idx_path],
def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None,
output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False):
super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
random_shuffle=False, shard_id=shard_id, num_shards=num_shards)
self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB,
device_memory_padding = nvjpeg_padding,
host_memory_padding = nvjpeg_padding)
self.resize = ops.Resize(device = "gpu", resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout = output_layout,
crop = crop_shape,
pad_output = pad_output,
image_type = types.RGB,
mean = _mean_pixel,
std = _std_pixel)
if dali_cpu:
dali_device = "cpu"
self.decode = ops.HostDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding,
host_memory_padding=nvjpeg_padding)
self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
def define_graph(self):
self.jpegs, self.labels = self.input(name = "Reader")
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
if self.resize:
images = self.resize(images)
output = self.cmnp(images)
output = self.cmnp(images.gpu())
return [output, self.labels]
def get_rec_iter(args, kv=None):
# resize is default base length of shorter edge for dataset;
# all images will be reshaped to this size
resize = int(args.resize)
# target shape is final shape of images pipelined to network;
# all images will be cropped to this size
target_shape = tuple([int(l) for l in args.image_shape.split(',')])
pad_output = target_shape[0] == 4
gpus = list(map(int, filter(None, args.gpus.split(',')))) # filter out any empty strings
batch_size = args.batch_size//len(gpus)
def get_rec_iter(args, kv=None, dali_cpu=False):
gpus = args.gpus
num_threads = args.dali_threads
num_validation_threads = args.validation_dali_threads
#db_folder = "/data/imagenet/train-480-val-256-recordio/"
num_validation_threads = args.dali_validation_threads
pad_output = (args.image_shape[0] == 4)
# the input_layout w.r.t. the model is the output_layout of the image pipeline
output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW
rank = kv.rank if kv else 0
nWrk = kv.num_workers if kv else 1
if 'horovod' in args.kv_store:
rank = hvd.rank()
nWrk = hvd.size()
else:
rank = kv.rank if kv else 0
nWrk = kv.num_workers if kv else 1
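# Every GPU of every worker reads its own shard (gpus.index(gpu_id) + len(gpus)*rank out of
# len(gpus)*nWrk shards), and the global batch size is split evenly across workers and GPUs.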
trainpipes = [HybridTrainPipe(batch_size = batch_size,
batch_size = args.batch_size // nWrk // len(gpus)
trainpipes = [HybridTrainPipe(args = args,
batch_size = batch_size,
num_threads = num_threads,
device_id = gpu_id,
rec_path = args.data_train,
idx_path = args.data_train_idx,
shard_id = gpus.index(gpu_id) + len(gpus)*rank,
num_shards = len(gpus)*nWrk,
crop_shape = target_shape[1:],
crop_shape = args.image_shape[1:],
output_layout = output_layout,
pad_output = pad_output,
dtype = args.dtype,
pad_output = pad_output,
dali_cpu = dali_cpu,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus]
valpipes = [HybridValPipe(batch_size = batch_size,
num_threads = num_validation_threads,
device_id = gpu_id,
rec_path = args.data_val,
idx_path = args.data_val_idx,
shard_id = 0 if args.separ_val
else gpus.index(gpu_id) + len(gpus)*rank,
num_shards = 1 if args.separ_val else len(gpus)*nWrk,
crop_shape = target_shape[1:],
resize_shp = resize,
output_layout = output_layout,
pad_output = pad_output,
dtype = args.dtype,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] if args.data_val else None
if args.data_val:
valpipes = [HybridValPipe(args = args,
batch_size = batch_size,
num_threads = num_validation_threads,
device_id = gpu_id,
rec_path = args.data_val,
idx_path = args.data_val_idx,
shard_id = 0 if args.dali_separ_val
else gpus.index(gpu_id) + len(gpus)*rank,
num_shards = 1 if args.dali_separ_val else len(gpus)*nWrk,
crop_shape = args.image_shape[1:],
resize_shp = args.data_val_resize,
output_layout = output_layout,
dtype = args.dtype,
pad_output = pad_output,
dali_cpu = dali_cpu,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] if args.data_val else None
trainpipes[0].build()
if args.data_val:
valpipes[0].build()
worker_val_examples = valpipes[0].epoch_size("Reader")
if not args.dali_separ_val:
worker_val_examples = worker_val_examples // nWrk
if rank < valpipes[0].epoch_size("Reader") % nWrk:
worker_val_examples += 1
if args.num_examples < trainpipes[0].epoch_size("Reader"):
warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader")))
dali_train_iter = DALIClassificationIterator(trainpipes, args.num_examples // nWrk)
dali_val_iter = DALIClassificationIterator(valpipes, valpipes[0].epoch_size("Reader") // (1 if args.separ_val else nWrk), fill_last_batch = False) if args.data_val else None
return dali_train_iter, dali_val_iter
if args.data_val:
dali_val_iter = DALIClassificationIterator(valpipes, worker_val_examples, fill_last_batch = False) if args.data_val else None
else:
dali_val_iter = None
return dali_train_iter, dali_val_iter

View file

@ -1,7 +1,5 @@
# -----------------------------------------------------------------------
# Copyright 2017-2018 The Apache Software Foundation
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -36,128 +34,61 @@
# limitations under the License.
import mxnet as mx
import mxnet.ndarray as nd
import random
import argparse
from mxnet.io import DataBatch, DataIter
import numpy as np
import horovod.mxnet as hvd
import dali
def add_data_args(parser):
data = parser.add_argument_group('Data', 'the input images')
def float_list(x):
return list(map(float, x.split(',')))
def int_list(x):
return list(map(int, x.split(',')))
data = parser.add_argument_group('Data')
data.add_argument('--data-train', type=str, help='the training data')
data.add_argument('--data-train-idx', type=str, default='', help='the index of training data')
data.add_argument('--data-val', type=str, help='the validation data')
data.add_argument('--data-val-idx', type=str, default='', help='the index of validation data')
data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939',
data.add_argument('--data-pred', type=str, help='the image on which run inference (only for pred mode)')
data.add_argument('--data-backend', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'), default='dali-gpu',
help='set data loading & augmentation backend')
data.add_argument('--image-shape', type=int_list, default=[3, 224, 224],
help='the image shape fed into the network')
data.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939],
help='a tuple of size 3 for the mean rgb')
data.add_argument('--rgb-std', type=str, default='1,1,1',
data.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375],
help='a tuple of size 3 for the std rgb')
data.add_argument('--pad-size', type=int, default=0,
help='padding the input image')
data.add_argument('--fill-value', type=int, default=127,
help='Set the padding pixels value to fill_value')
data.add_argument('--image-shape', type=str,
help='the image shape fed into the network, e.g. (3,224,224)')
data.add_argument('--num-classes', type=int, help='the number of classes')
data.add_argument('--num-examples', type=int, help='the number of training examples')
data.add_argument('--data-nthreads', type=int, default=4,
help='number of threads for data decoding')
data.add_argument('--benchmark-iters', type=int, default=None,
help='run only benchmark-iters iterations from each epoch')
data.add_argument('--input-layout', type=str, default='NCHW',
help='the layout of the input data (e.g. NCHW)')
data.add_argument('--conv-layout', type=str, default='NCHW',
help='the layout of the data assumed by the conv operation (e.g. NCHW)')
data.add_argument('--conv-algo', type=int, default=-1,
help='set the convolution algos (fwd, dgrad, wgrad)')
data.add_argument('--batchnorm-layout', type=str, default='NCHW',
help='the layout of the data assumed by the batchnorm operation (e.g. NCHW)')
data.add_argument('--batchnorm-eps', type=float, default=2e-5,
help='the amount added to the batchnorm variance to prevent output explosion.')
data.add_argument('--batchnorm-mom', type=float, default=0.9,
help='the leaky-integrator factor controlling the batchnorm mean and variance.')
data.add_argument('--pooling-layout', type=str, default='NCHW',
help='the layout of the data assumed by the pooling operation (e.g. NCHW)')
data.add_argument('--verbose', type=int, default=0,
help='turn on reporting of chosen algos for convolution, etc.')
data.add_argument('--seed', type=int, default=None,
help='set the seed for python, nd and mxnet rngs')
data.add_argument('--custom-bn-off', type=int, default=0,
help='disable use of custom batchnorm kernel')
data.add_argument('--fuse-bn-relu', type=int, default=0,
help='have batchnorm kernel perform activation relu')
data.add_argument('--fuse-bn-add-relu', type=int, default=0,
help='have batchnorm kernel perform add followed by activation relu')
data.add_argument('--force-tensor-core', type=int, default=0,
help='require conv algos to be tensor core')
data.add_argument('--input-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the input data')
data.add_argument('--conv-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the conv operation')
data.add_argument('--batchnorm-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the batchnorm operation')
data.add_argument('--pooling-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the pooling operation')
data.add_argument('--num-examples', type=int, default=1281167,
help="the number of training examples (doesn't work with mxnet data backend)")
data.add_argument('--data-val-resize', type=int, default=256,
help='base length of shorter edge for validation dataset')
return data
# Action to translate --set-resnet-aug flag to its component settings.
class SetResnetAugAction(argparse.Action):
def __init__(self, nargs=0, **kwargs):
if nargs != 0:
raise ValueError('nargs for SetResnetAug must be 0.')
super(SetResnetAugAction, self).__init__(nargs=nargs, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
# standard data augmentation setting for resnet training
setattr(namespace, 'random_crop', 1)
setattr(namespace, 'random_resized_crop', 1)
setattr(namespace, 'random_mirror', 1)
setattr(namespace, 'min_random_area', 0.08)
setattr(namespace, 'max_random_aspect_ratio', 4./3.)
setattr(namespace, 'min_random_aspect_ratio', 3./4.)
setattr(namespace, 'brightness', 0.4)
setattr(namespace, 'contrast', 0.4)
setattr(namespace, 'saturation', 0.4)
setattr(namespace, 'pca_noise', 0.1)
# record that this --set-resnet-aug 'macro arg' has been invoked
setattr(namespace, self.dest, 1)
# Similar to the above, but suitable for calling within a training script to set the defaults.
def set_resnet_aug(aug):
# standard data augmentation setting for resnet training
aug.set_defaults(random_crop=0, random_resized_crop=1)
aug.set_defaults(random_mirror=1)
aug.set_defaults(min_random_area=0.08)
aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.)
aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1)
# Action to translate --set-data-aug-level <N> arg to its component settings.
class SetDataAugLevelAction(argparse.Action):
def __init__(self, option_strings, dest, nargs=None, **kwargs):
if nargs is not None:
raise ValueError("nargs not allowed")
super(SetDataAugLevelAction, self).__init__(option_strings, dest, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
level = values
# record that this --set-data-aug-level <N> 'macro arg' has been invoked
setattr(namespace, self.dest, level)
if level >= 1:
setattr(namespace, 'random_crop', 1)
setattr(namespace, 'random_mirror', 1)
if level >= 2:
setattr(namespace, 'max_random_h', 36)
setattr(namespace, 'max_random_s', 50)
setattr(namespace, 'max_random_l', 50)
if level >= 3:
setattr(namespace, 'max_random_rotate_angle', 10)
setattr(namespace, 'max_random_shear_ratio', 0.1)
setattr(namespace, 'max_random_aspect_ratio', 0.25)
# Similar to the above, but suitable for calling within a training script to set the defaults.
def set_data_aug_level(aug, level):
if level >= 1:
aug.set_defaults(random_crop=1, random_mirror=1)
if level >= 2:
aug.set_defaults(max_random_h=36, max_random_s=50, max_random_l=50)
if level >= 3:
aug.set_defaults(max_random_rotate_angle=10, max_random_shear_ratio=0.1, max_random_aspect_ratio=0.25)
def add_data_aug_args(parser):
aug = parser.add_argument_group(
'Image augmentations', 'implemented in src/io/image_aug_default.cc')
'MXNet data backend', 'entire group applies only to mxnet data backend')
aug.add_argument('--data-mxnet-threads', type=int, default=40,
help='number of threads for data decoding for mxnet data backend')
aug.add_argument('--random-crop', type=int, default=0,
help='whether or not to randomly crop the image')
aug.add_argument('--random-mirror', type=int, default=0,
aug.add_argument('--random-mirror', type=int, default=1,
help='whether or not to randomly flip the image horizontally')
aug.add_argument('--max-random-h', type=int, default=0,
help='max change of hue, whose range is [0, 180]')
@ -165,9 +96,9 @@ def add_data_aug_args(parser):
help='max change of saturation, whose range is [0, 255]')
aug.add_argument('--max-random-l', type=int, default=0,
help='max change of intensity, whose range is [0, 255]')
aug.add_argument('--min-random-aspect-ratio', type=float, default=None,
aug.add_argument('--min-random-aspect-ratio', type=float, default=0.75,
help='min value of aspect ratio, whose value is either None or a positive value.')
aug.add_argument('--max-random-aspect-ratio', type=float, default=0,
aug.add_argument('--max-random-aspect-ratio', type=float, default=1.33,
help='max value of aspect ratio. If min_random_aspect_ratio is None, '
'the aspect ratio range is [1-max_random_aspect_ratio, '
'1+max_random_aspect_ratio], otherwise it is '
@ -183,7 +114,7 @@ def add_data_aug_args(parser):
'otherwise use --pad-size')
aug.add_argument('--max-random-area', type=float, default=1,
help='max area to crop in random resized crop, whose range is [0, 1]')
aug.add_argument('--min-random-area', type=float, default=1,
aug.add_argument('--min-random-area', type=float, default=0.05,
help='min area to crop in random resized crop, whose range is [0, 1]')
aug.add_argument('--min-crop-size', type=int, default=-1,
help='Crop both width and height into a random size in '
@ -199,87 +130,200 @@ def add_data_aug_args(parser):
help='saturation jittering, whose range is [0, 1]')
aug.add_argument('--pca-noise', type=float, default=0,
help='pca noise, whose range is [0, 1]')
aug.add_argument('--random-resized-crop', type=int, default=0,
aug.add_argument('--random-resized-crop', type=int, default=1,
help='whether to use random resized crop')
aug.add_argument('--set-resnet-aug', action=SetResnetAugAction,
help='whether to employ standard resnet augmentations (see data.py)')
aug.add_argument('--set-data-aug-level', type=int, default=None, action=SetDataAugLevelAction,
help='set multiple data augmentations based on a `level` (see data.py)')
return aug
def get_data_loader(args):
if args.data_backend == 'dali-gpu':
return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, dali_cpu=False))
if args.data_backend == 'dali-cpu':
return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, dali_cpu=True))
if args.data_backend == 'synthetic':
return get_synthetic_rec_iter
if args.data_backend == 'mxnet':
return get_rec_iter
raise ValueError('Wrong data backend')
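# DataGPUSplit wraps a single iterator and slices every batch across the given contexts, copying each slice
# to its GPU, casting it to the training dtype and recomputing the pad of the last, possibly incomplete batch.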
class DataGPUSplit:
def __init__(self, dataloader, ctx, dtype):
self.dataloader = dataloader
self.ctx = ctx
self.dtype = dtype
self.batch_size = dataloader.batch_size // len(ctx)
self._num_gpus = len(ctx)
def __iter__(self):
return DataGPUSplit(iter(self.dataloader), self.ctx, self.dtype)
def __next__(self):
data = next(self.dataloader)
ret = []
for i in range(len(self.ctx)):
start = i * len(data.data[0]) // len(self.ctx)
end = (i + 1) * len(data.data[0]) // len(self.ctx)
pad = max(0, min(data.pad - (len(self.ctx) - i - 1) * self.batch_size, self.batch_size))
ret.append(mx.io.DataBatch(
[data.data[0][start:end].as_in_context(self.ctx[i]).astype(self.dtype)],
[data.label[0][start:end].as_in_context(self.ctx[i])],
pad=pad))
return ret
def next(self):
return next(self)
def reset(self):
self.dataloader.reset()
def get_rec_iter(args, kv=None):
image_shape = tuple([int(l) for l in args.image_shape.split(',')])
if args.input_layout == 'NHWC':
image_shape = image_shape[1:] + (image_shape[0],)
if kv:
(rank, nworker) = (kv.rank, kv.num_workers)
gpus = args.gpus
if 'horovod' in args.kv_store:
rank = hvd.rank()
nworker = hvd.size()
gpus = [gpus[0]]
batch_size = args.batch_size // hvd.size()
else:
(rank, nworker) = (0, 1)
rgb_mean = [float(i) for i in args.rgb_mean.split(',')]
rgb_std = [float(i) for i in args.rgb_std.split(',')]
rank = kv.rank if kv else 0
nworker = kv.num_workers if kv else 1
batch_size = args.batch_size
if args.input_layout == 'NHWC':
raise ValueError('ImageRecordIter cannot handle layout {}'.format(args.input_layout))
train = mx.io.ImageRecordIter(
path_imgrec = args.data_train,
path_imgidx = args.data_train_idx,
label_width = 1,
mean_r = rgb_mean[0],
mean_g = rgb_mean[1],
mean_b = rgb_mean[2],
std_r = rgb_std[0],
std_g = rgb_std[1],
std_b = rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
data_shape = image_shape,
batch_size = args.batch_size,
rand_crop = args.random_crop,
max_random_scale = args.max_random_scale,
pad = args.pad_size,
fill_value = args.fill_value,
random_resized_crop = args.random_resized_crop,
min_random_scale = args.min_random_scale,
max_aspect_ratio = args.max_random_aspect_ratio,
min_aspect_ratio = args.min_random_aspect_ratio,
max_random_area = args.max_random_area,
min_random_area = args.min_random_area,
min_crop_size = args.min_crop_size,
max_crop_size = args.max_crop_size,
brightness = args.brightness,
contrast = args.contrast,
saturation = args.saturation,
pca_noise = args.pca_noise,
random_h = args.max_random_h,
random_s = args.max_random_s,
random_l = args.max_random_l,
max_rotate_angle = args.max_random_rotate_angle,
max_shear_ratio = args.max_random_shear_ratio,
rand_mirror = args.random_mirror,
preprocess_threads = args.data_nthreads,
shuffle = True,
num_parts = nworker,
part_index = rank)
train = DataGPUSplit(mx.io.ImageRecordIter(
path_imgrec = args.data_train,
path_imgidx = args.data_train_idx,
label_width = 1,
mean_r = args.rgb_mean[0],
mean_g = args.rgb_mean[1],
mean_b = args.rgb_mean[2],
std_r = args.rgb_std[0],
std_g = args.rgb_std[1],
std_b = args.rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
data_shape = args.image_shape,
batch_size = batch_size,
rand_crop = args.random_crop,
max_random_scale = args.max_random_scale,
random_resized_crop = args.random_resized_crop,
min_random_scale = args.min_random_scale,
max_aspect_ratio = args.max_random_aspect_ratio,
min_aspect_ratio = args.min_random_aspect_ratio,
max_random_area = args.max_random_area,
min_random_area = args.min_random_area,
min_crop_size = args.min_crop_size,
max_crop_size = args.max_crop_size,
brightness = args.brightness,
contrast = args.contrast,
saturation = args.saturation,
pca_noise = args.pca_noise,
random_h = args.max_random_h,
random_s = args.max_random_s,
random_l = args.max_random_l,
max_rotate_angle = args.max_random_rotate_angle,
max_shear_ratio = args.max_random_shear_ratio,
rand_mirror = args.random_mirror,
preprocess_threads = args.data_mxnet_threads,
shuffle = True,
num_parts = nworker,
part_index = rank,
seed = args.seed or '0',
), [mx.gpu(gpu) for gpu in gpus], args.dtype)
if args.data_val is None:
return (train, None)
val = mx.io.ImageRecordIter(
path_imgrec = args.data_val,
path_imgidx = args.data_val_idx,
label_width = 1,
mean_r = rgb_mean[0],
mean_g = rgb_mean[1],
mean_b = rgb_mean[2],
std_r = rgb_std[0],
std_g = rgb_std[1],
std_b = rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
batch_size = args.batch_size,
round_batch = False,
data_shape = image_shape,
preprocess_threads = args.data_nthreads,
rand_crop = False,
rand_mirror = False,
num_parts = nworker,
part_index = rank)
val = DataGPUSplit(mx.io.ImageRecordIter(
path_imgrec = args.data_val,
path_imgidx = args.data_val_idx,
label_width = 1,
mean_r = args.rgb_mean[0],
mean_g = args.rgb_mean[1],
mean_b = args.rgb_mean[2],
std_r = args.rgb_std[0],
std_g = args.rgb_std[1],
std_b = args.rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
batch_size = batch_size,
round_batch = False,
data_shape = args.image_shape,
preprocess_threads = args.data_mxnet_threads,
rand_crop = False,
rand_mirror = False,
num_parts = nworker,
part_index = rank,
resize = args.data_val_resize,
), [mx.gpu(gpu) for gpu in gpus], args.dtype)
return (train, val)
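# SyntheticDataIter builds one random batch per GPU up front and replays it max_iter times,
# so input pipeline cost is excluded from the measurement.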
class SyntheticDataIter(DataIter):
def __init__(self, num_classes, data_shape, max_iter, ctx, dtype):
self.batch_size = data_shape[0]
self.cur_iter = 0
self.max_iter = max_iter
self.dtype = dtype
label = np.random.randint(0, num_classes, [self.batch_size,])
data = np.random.uniform(-1, 1, data_shape)
self.data = []
self.label = []
self._num_gpus = len(ctx)
for dev in ctx:
self.data.append(mx.nd.array(data, dtype=self.dtype, ctx=dev))
self.label.append(mx.nd.array(label, dtype=self.dtype, ctx=dev))
def __iter__(self):
return self
def next(self):
self.cur_iter += 1
if self.cur_iter <= self.max_iter:
return [DataBatch(data=(data,), label=(label,), pad=0) for data, label in zip(self.data, self.label)]
else:
raise StopIteration
def __next__(self):
return self.next()
def reset(self):
self.cur_iter = 0
def get_synthetic_rec_iter(args, kv=None):
gpus = args.gpus
if 'horovod' in args.kv_store:
gpus = [gpus[0]]
batch_size = args.batch_size // hvd.size()
else:
batch_size = args.batch_size
if args.input_layout == 'NCHW':
data_shape = (batch_size, *args.image_shape)
elif args.input_layout == 'NHWC':
data_shape = (batch_size, *args.image_shape[1:], args.image_shape[0])
else:
raise ValueError('Wrong input layout')
train = SyntheticDataIter(args.num_classes, data_shape,
args.num_examples // args.batch_size,
[mx.gpu(gpu) for gpu in gpus], args.dtype)
if args.data_val is None:
return (train, None)
val = SyntheticDataIter(args.num_classes, data_shape,
args.num_examples // args.batch_size,
[mx.gpu(gpu) for gpu in gpus], args.dtype)
return (train, val)
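# Loads a single image for pred mode: resize to the network input size, normalize with the RGB mean/std,
# move to the target context, transpose to NCHW if required and append a zero channel when the model expects 4 input channels.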
def load_image(args, path, ctx=mx.cpu()):
image = mx.image.imread(path).astype('float32')
image = mx.image.imresize(image, *args.image_shape[1:])
image = (image - nd.array(args.rgb_mean)) / nd.array(args.rgb_std)
image = image.as_in_context(ctx)
if args.input_layout == 'NCHW':
image = image.transpose((2, 0, 1))
image = image.astype(args.dtype)
if args.image_shape[0] == 4:
dim = 0 if args.input_layout == 'NCHW' else 2
image = nd.concat(image, nd.zeros((1, *image.shape[1:]), dtype=image.dtype, ctx=image.context), dim=dim)
return image

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 inference benchmark in FP16 on 1 GPU with batch sizes 1,2,4,64,128,192,208
# Usage ./INFER_BENCHMARK_FP16.sh <additional flags>
python benchmark.py -n 1 -b 1,2,4,64,128,192,208 --only-inference -e 3 -w 1 -i 100 -o report.json $@

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP16 on 1 GPU using a batch size of 208 (208 per GPU)
# Usage ./RN50_FP16_1GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 1 -b 208 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP16 on 8 GPUs using a total batch size of 1664 (208 per GPU)
# Usage ./RN50_FP16_8GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 8 -b 208 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 1 GPU using a batch size of 96 (96 per GPU)
# Usage ./RN50_FP32_1GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 1 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 4 GPUs using a total batch size of 384 (96 per GPU)
# Usage ./RN50_FP32_4GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 4 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 8 GPUs using a total batch size of 768 (96 per GPU)
# Usage ./RN50_FP32_8GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 8 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script scores a ResNet50 checkpoint in FP16 on 1 GPU using a batch size of 128
# Usage ./SCORE_FP16.sh <model prefix> <epoch> <additional flags>
./runner -n 1 -b 128 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script scores a ResNet50 checkpoint in FP32 on 1 GPU using a batch size of 64
# Usage ./SCORE_FP32.sh <model prefix> <epoch> <additional flags>
./runner -n 1 -b 64 --dtype float32 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

View file

@ -33,197 +33,408 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" example train fit utility """
""" train fit utility """
import logging
import os
import time
import re
import math
import sys
import random
from itertools import starmap
import numpy as np
import mxnet as mx
import mxnet.ndarray as nd
import horovod.mxnet as hvd
import mxnet.contrib.amp as amp
from mxnet import autograd as ag
from mxnet import gluon
from report import Report
from benchmarking import BenchmarkingDataIter
def get_epoch_size(args, kv):
return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
def _get_lr_scheduler(args, kv):
if 'lr_factor' not in args or args.lr_factor >= 1:
return (args.lr, None)
epoch_size = get_epoch_size(args, kv)
begin_epoch = args.load_epoch if args.load_epoch else 0
if 'pow' in args.lr_step_epochs:
lr = args.lr
max_up = args.num_epochs * epoch_size
pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs))
poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
return (lr, poly_sched)
step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
lr = args.lr
for s in step_epochs:
if begin_epoch >= s:
lr *= args.lr_factor
if lr != args.lr:
logging.info('Adjust learning rate to %e for epoch %d',
lr, begin_epoch)
steps = [epoch_size * (x - begin_epoch)
for x in step_epochs if x - begin_epoch > 0]
if steps:
if kv:
num_workers = kv.num_workers
else:
num_workers = 1
epoch_size = math.ceil(int(args.num_examples/num_workers)/args.batch_size)
return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
base_lr=args.lr, warmup_steps=epoch_size * args.warmup_epochs,
warmup_mode=args.warmup_strategy))
else:
return (lr, None)
def _load_model(args, rank=0):
if 'load_epoch' not in args or args.load_epoch is None:
return (None, None, None)
assert args.model_prefix is not None
model_prefix = args.model_prefix
if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
model_prefix += "-%d" % (rank)
sym, arg_params, aux_params = mx.model.load_checkpoint(
model_prefix, args.load_epoch)
logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch)
return (sym, arg_params, aux_params)
def _save_model(args, rank=0):
if args.model_prefix is None:
return None
return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
args.model_prefix, rank), period=args.save_period)
import data
def add_fit_args(parser):
"""
parser : argparse.ArgumentParser
return a parser added with args required by fit
"""
train = parser.add_argument_group('Training', 'model training')
train.add_argument('--num-layers', type=int,
help='number of layers in the neural network, \
required by some networks such as resnet')
train.add_argument('--gpus', type=str,
help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
train.add_argument('--kv-store', type=str, default='device',
def int_list(x):
return list(map(int, x.split(',')))
def float_list(x):
return list(map(float, x.split(',')))
train = parser.add_argument_group('Training')
train.add_argument('--mode', default='train_val', choices=('train_val', 'train', 'val', 'pred'),
help='mode')
train.add_argument('--seed', type=int, default=None,
help='random seed')
train.add_argument('--gpus', type=int_list, default=[0],
help='list of gpus to run, e.g. 0 or 0,2,5')
train.add_argument('--kv-store', type=str, default='device', choices=('device', 'horovod'),
help='key-value store type')
train.add_argument('--num-epochs', type=int, default=100,
help='max num of epochs')
train.add_argument('--dtype', type=str, default='float16', choices=('float32', 'float16'),
help='precision')
train.add_argument('--amp', action='store_true',
help='If enabled, turn on AMP (Automatic Mixed Precision)')
train.add_argument('--batch-size', type=int, default=192,
help='the batch size')
train.add_argument('--num-epochs', type=int, default=90,
help='number of epochs')
train.add_argument('--lr', type=float, default=0.1,
help='initial learning rate')
train.add_argument('--lr-factor', type=float, default=0.1,
train.add_argument('--lr-schedule', choices=('multistep', 'cosine'), default='cosine',
help='learning rate schedule')
train.add_argument('--lr-factor', type=float, default=0.256,
help='the ratio to reduce lr on each step')
train.add_argument('--lr-step-epochs', type=str,
train.add_argument('--lr-steps', type=float_list, default=[],
help='the epochs to reduce the lr, e.g. 30,60')
train.add_argument('--initializer', type=str, default='default',
help='the initializer type')
train.add_argument('--optimizer', type=str, default='sgd',
help='the optimizer type')
train.add_argument('--mom', type=float, default=0.9,
help='momentum for sgd')
train.add_argument('--wd', type=float, default=0.0001,
help='weight decay for sgd')
train.add_argument('--batch-size', type=int, default=208,
help='the batch size')
train.add_argument('--disp-batches', type=int, default=20,
help='show progress for every n batches')
train.add_argument('--model-prefix', type=str,
help='model prefix')
train.add_argument('--save-period', type=int, default=1, help='params saving period')
parser.add_argument('--monitor', dest='monitor', type=int, default=0,
help='log network parameters every N iters if larger than 0')
train.add_argument('--load-epoch', type=int,
help='load the model on an epoch using the model-load-prefix')
train.add_argument('--loss', type=str, default='',
help='show the cross-entropy or nll loss. ce stands for cross-entropy, nll-loss stands for negative log-likelihood loss')
train.add_argument('--test-io', type=int, default=0,
help='1 means test reading speed without training')
train.add_argument('--dtype', type=str, default='float16',
help='precision: float32 or float16')
train.add_argument('--gc-type', type=str, default='none',
help='type of gradient compression to use, \
takes `2bit` or `none` for now')
train.add_argument('--gc-threshold', type=float, default=0.5,
help='threshold for 2bit gradient compression')
# additional parameters for large batch sgd
train.add_argument('--macrobatch-size', type=int, default=0,
help='distributed effective batch size')
train.add_argument('--warmup-epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
train.add_argument('--warmup-strategy', type=str, default='linear',
help='the ramping-up strategy for large batch sgd')
train.add_argument('--logging-dir', type=str, default='logs')
train.add_argument('--log', type=str, default='')
train.add_argument('--bn-gamma-init0', action='store_true')
train.add_argument('--epoch-size',type=int, default=0,
help='set number of batches in an epoch. useful for debugging')
#train.add_argument('--tensorboard', type=str, default='',
# help='log parameters to visualize in tensorboard every epoch. takes name to specify as tensorboard run. Empty means tensorboard logging is disabled')
train.add_argument('--profile-worker-suffix', type=str, default='',
help='profile worker actions into this file. During distributed training\
the saved filename will be rank1_ followed by this suffix')
train.add_argument('--profile-server-suffix', type=str, default='',
help='profile server actions into a file with name like rank1_ followed by this suffix \
during distributed training')
train.add_argument('--report', type=str, help='file where to save report')
train.add_argument('--only-inference', action='store_true', help='do not train, only inference (for benchmarking)')
train.add_argument('--optimizer', type=str, default='sgd',
help='the optimizer type')
train.add_argument('--mom', type=float, default=0.875,
help='momentum for sgd')
train.add_argument('--wd', type=float, default=1 / 32768,
help='weight decay for sgd')
train.add_argument('--label-smoothing', type=float, default=0.1,
help='label smoothing factor')
train.add_argument('--mixup', type=float, default=0,
help='alpha parameter for mixup (if 0 then mixup is not applied)')
train.add_argument('--disp-batches', type=int, default=20,
help='show progress for every n batches')
train.add_argument('--model-prefix', type=str, default='model',
help='model checkpoint prefix')
train.add_argument('--save-frequency', type=int, default=-1,
help='frequency of saving model in epochs (--model-prefix must be specified). '
'If -1 then save only best model. If 0 then do not save anything.')
train.add_argument('--begin-epoch', type=int, default=0,
help='start the model from an epoch')
train.add_argument('--load', help='checkpoint to load')
train.add_argument('--test-io', action='store_true',
help='test reading speed without training')
train.add_argument('--test-io-mode', default='train', choices=('train', 'val'),
help='data to test')
train.add_argument('--log', type=str, default='log.log',
help='file where to save the log from the experiment')
train.add_argument('--report', default='report.json', help='file where to save report')
train.add_argument('--no-metrics', action='store_true', help='do not calculate evaluation metrics (for benchmarking)')
train.add_argument('--benchmark-iters', type=int, default=None,
help='run only benchmark-iters iterations from each epoch')
return train
def get_epoch_size(args, kv):
return math.ceil(args.num_examples / args.batch_size)
def fit(args, network, data_loader, **kwargs):
def get_lr_scheduler(args):
def multistep_schedule(x):
lr = args.lr * (args.lr_factor ** (len(list(filter(lambda step: step <= x, args.lr_steps)))))
warmup_coeff = min(1, x / args.warmup_epochs)
return warmup_coeff * lr
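# Cosine schedule: linear warmup over the first warmup_epochs, then a cosine decay from one step boundary
# to the next, with the base lr additionally scaled by lr_factor**i after the i-th boundary.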
def cosine_schedule(x):
steps = args.lr_steps
if not steps or steps[0] > args.warmup_epochs:
steps = [args.warmup_epochs] + steps
elif not steps or steps[0] != 0:
steps = [0] + steps
if steps[-1] != args.num_epochs:
steps.append(args.num_epochs)
if x < args.warmup_epochs:
return args.lr * x / args.warmup_epochs
for i, (step, next_step) in enumerate(zip(steps, steps[1:])):
if next_step > x:
return args.lr * 0.5 * (1 + math.cos(math.pi * (x - step) / (next_step - step))) * (args.lr_factor ** i)
return 0
schedules = {
'multistep': multistep_schedule,
'cosine': cosine_schedule,
}
return schedules[args.lr_schedule]
def load_model(args, model):
if args.load is None:
return False
model.load_parameters(args.load)
logging.info('Loaded model {}'.format(args.load))
return True
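# Writes a checkpoint every save_frequency epochs and keeps a separate <prefix>_best.params copy whenever
# the validation top-1 improves; under Horovod only rank 0 saves.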
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency, kvstore):
if model_prefix is None or save_frequency == 0 or ('horovod' in kvstore and hvd.rank() != 0):
return
if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
fname = '{}_{:04}.params'.format(model_prefix, epoch)
net.save_parameters(fname)
logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
if top1 > best_acc:
fname = '{}_best.params'.format(model_prefix)
net.save_parameters(fname)
logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
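# Records top-1/top-5, optional loss, latency percentiles and throughput; the first `warmup` measurements
# are dropped and total_ips = total_batch_size / mean latency.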
def add_metrics_to_report(report, mode, metric, durations, total_batch_size, loss=None, warmup=20):
if report is None:
return
top1 = metric.get('accuracy', None)
if top1 is not None:
report.add_value('{}.top1'.format(mode), top1)
top5 = metric.get('top_k_accuracy_5', None)
if top5 is not None:
report.add_value('{}.top5'.format(mode), top5)
if loss is not None:
report.add_value('{}.loss'.format(mode), loss.get_global()[1])
if len(durations) > warmup:
durations = durations[warmup:]
duration = np.mean(durations)
total_ips = total_batch_size / duration
report.add_value('{}.latency_avg'.format(mode), duration)
for percentile in [50, 90, 95, 99, 100]:
report.add_value('{}.latency_{}'.format(mode, percentile), np.percentile(durations, percentile))
report.add_value('{}.total_ips'.format(mode), total_ips)
def model_pred(args, model, image):
from imagenet_classes import classes
output = model(image.reshape(-1, *image.shape))[0].softmax().as_in_context(mx.cpu())
top = output.argsort(is_ascend=False)[:10]
for i, ind in enumerate(top):
ind = int(ind.asscalar())
logging.info('{:2d}. {:5.2f}% -> {}'.format(i + 1, output[ind].asscalar() * 100, classes[ind]))
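# Reduces metric values across Horovod workers with hvd.allreduce; returns the metrics unchanged for
# single-worker or non-horovod runs.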
def reduce_metrics(args, metrics, kvstore):
if 'horovod' not in kvstore or not metrics[0] or hvd.size() == 1:
return metrics
m = mx.ndarray.array(metrics[1], ctx=mx.gpu(args.gpus[0]))
reduced = hvd.allreduce(m)
values = reduced.as_in_context(mx.cpu()).asnumpy().tolist()
return (metrics[0], values)
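# Validation loop: waits on the previous iteration's outputs before launching the next forward pass,
# trims padded samples from the last batch, and reports the (possibly reduced) metric plus latency/throughput.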
def model_score(args, net, val_data, metric, kvstore, report=None):
if val_data is None:
logging.info('Omitting validation: no data')
return [], []
if not isinstance(metric, mx.metric.EvalMetric):
metric = mx.metric.create(metric)
metric.reset()
val_data.reset()
total_batch_size = val_data.batch_size * val_data._num_gpus * (hvd.size() if 'horovod' in kvstore else 1)
durations = []
tic = time.time()
outputs = []
for batches in val_data:
# synchronize to previous iteration
for o in outputs:
o.wait_to_read()
data = [b.data[0] for b in batches]
label = [b.label[0][:len(b.data[0]) - b.pad] for b in batches if len(b.data[0]) != b.pad]
outputs = [net(X) for X, b in zip(data, batches)]
outputs = [o[:len(b.data[0]) - b.pad] for o, b in zip(outputs, batches) if len(b.data[0]) != b.pad]
metric.update(label, outputs)
durations.append(time.time() - tic)
tic = time.time()
metric = reduce_metrics(args, metric.get_global(), kvstore)
add_metrics_to_report(report, 'val', dict(zip(*metric)), durations, total_batch_size)
return metric
class ScalarMetric(mx.metric.Loss):
def update(self, _, scalar):
self.sum_metric += scalar
self.global_sum_metric += scalar
self.num_inst += 1
self.global_num_inst += 1
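# Smoothed one-hot targets: the true class gets 1 - eta + eta/classes, every other class gets eta/classes.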
def label_smoothing(labels, classes, eta):
return labels.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
def model_fit(args, net, train_data, eval_metric, optimizer,
optimizer_params, lr_scheduler, eval_data, kvstore, kv,
begin_epoch, num_epoch, model_prefix, report, print_loss):
if not isinstance(eval_metric, mx.metric.EvalMetric):
eval_metric = mx.metric.create(eval_metric)
loss_metric = ScalarMetric()
if 'horovod' in kvstore:
trainer = hvd.DistributedTrainer(net.collect_params(), optimizer, optimizer_params)
else:
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params,
kvstore=kv, update_on_kvstore=False)
if args.amp:
amp.init_trainer(trainer)
sparse_label_loss = (args.label_smoothing == 0 and args.mixup == 0)
loss = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
loss.hybridize(static_shape=True, static_alloc=True)
local_batch_size = train_data.batch_size
total_batch_size = local_batch_size * train_data._num_gpus * (hvd.size() if 'horovod' in kvstore else 1)
durations = []
epoch_size = get_epoch_size(args, kv)
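# mixup: blend each image (and its smoothed label) with the reversed batch using per-sample
# Beta(alpha, alpha) coefficients; with mixup disabled, only label smoothing is applied when requested.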
def transform_data(images, labels):
if args.mixup != 0:
coeffs = mx.nd.array(np.random.beta(args.mixup, args.mixup, size=images.shape[0])).as_in_context(images.context)
image_coeffs = coeffs.astype(images.dtype, copy=False).reshape(*coeffs.shape, 1, 1, 1)
ret_images = image_coeffs * images + (1 - image_coeffs) * images[::-1]
ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
label_coeffs = coeffs.reshape(*coeffs.shape, 1)
ret_labels = label_coeffs * ret_labels + (1 - label_coeffs) * ret_labels[::-1]
else:
ret_images = images
if not sparse_label_loss:
ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
else:
ret_labels = labels
return ret_images, ret_labels
best_accuracy = -1
for epoch in range(begin_epoch, num_epoch):
tic = time.time()
train_data.reset()
eval_metric.reset()
loss_metric.reset()
btic = time.time()
logging.info('Starting epoch {}'.format(epoch))
outputs = []
for i, batches in enumerate(train_data):
# synchronize to previous iteration
for o in outputs:
o.wait_to_read()
trainer.set_learning_rate(lr_scheduler(epoch + i / epoch_size))
data = [b.data[0] for b in batches]
label = [b.label[0].as_in_context(b.data[0].context) for b in batches]
orig_label = label
data, label = zip(*starmap(transform_data, zip(data, label)))
outputs = []
Ls = []
with ag.record():
for x, y in zip(data, label):
z = net(x)
L = loss(z, y)
# store the loss and do backward after we have done forward
# on all GPUs for better speed on multiple GPUs.
Ls.append(L)
outputs.append(z)
if args.amp:
with amp.scale_loss(Ls, trainer) as scaled_loss:
ag.backward(scaled_loss)
else:
ag.backward(Ls)
if 'horovod' in kvstore:
trainer.step(local_batch_size)
else:
trainer.step(total_batch_size)
if print_loss:
loss_metric.update(..., np.mean([l.asnumpy() for l in Ls]).item())
eval_metric.update(orig_label, outputs)
if args.disp_batches and not (i + 1) % args.disp_batches:
name, acc = eval_metric.get()
if print_loss:
name = [loss_metric.get()[0]] + name
acc = [loss_metric.get()[1]] + acc
logging.info('Epoch[{}] Batch [{}-{}]\tSpeed: {} samples/sec\tLR: {}\t{}'.format(
epoch, (i // args.disp_batches) * args.disp_batches, i,
args.disp_batches * total_batch_size / (time.time() - btic), trainer.learning_rate,
'\t'.join(list(map(lambda x: '{}: {:.6f}'.format(*x), zip(name, acc))))))
eval_metric.reset_local()
loss_metric.reset_local()
btic = time.time()
durations.append(time.time() - tic)
tic = time.time()
add_metrics_to_report(report, 'train', dict(eval_metric.get_global_name_value()), durations, total_batch_size, loss_metric if print_loss else None)
if args.mode == 'train_val':
logging.info('Validating epoch {}'.format(epoch))
score = model_score(args, net, eval_data, eval_metric, kvstore, report)
for name, value in zip(*score):
logging.info('Epoch[{}] Validation {:20}: {}'.format(epoch, name, value))
score = dict(zip(*score))
accuracy = score.get('accuracy', -1)
save_checkpoint(net, epoch, accuracy, best_accuracy, model_prefix, args.save_frequency, kvstore)
best_accuracy = max(best_accuracy, accuracy)
def fit(args, model, data_loader):
"""
train a model
args : argparse returns
network : the symbol definition of the neural network
model : the neural network model
data_loader : function that returns the train and val data iterators
"""
start_time = time.time()
report = Report(args.arch, len(args.gpus), sys.argv)
# select gpu for horovod process
if 'horovod' in args.kv_store:
hvd.init()
args.gpus = [args.gpus[hvd.local_rank()]]
if args.amp:
amp.init()
if args.seed is not None:
logging.info('Setting seeds to {}'.format(args.seed))
random.seed(args.seed)
np.random.seed(args.seed)
mx.random.seed(args.seed)
# kvstore
kv = mx.kvstore.create(args.kv_store)
if args.gc_type != 'none':
kv.set_gradient_compression({'type': args.gc_type,
'threshold': args.gc_threshold})
if args.profile_server_suffix:
mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server')
mx.profiler.set_state(state='run', profile_process='server')
if args.profile_worker_suffix:
if kv.num_workers > 1:
filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
else:
filename = args.profile_worker_suffix
mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
mx.profiler.set_state(state='run', profile_process='worker')
# logging
head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logging.info('start with arguments %s', args)
epoch_size = get_epoch_size(args, kv)
# data iterators
(train, val) = data_loader(args, kv)
if 'dist' in args.kv_store and not 'async' in args.kv_store:
logging.info('Resizing training data to %d batches per machine', epoch_size)
# resize train iter to ensure each machine has same number of batches per epoch
# if not, dist_sync can hang at the end with one machine waiting for other machines
if not args.use_dali:
train = mx.io.ResizeIter(train, epoch_size)
if 'horovod' in args.kv_store:
kv = None
rank = hvd.rank()
num_workers = hvd.size()
else:
kv = mx.kvstore.create(args.kv_store)
rank = kv.rank
num_workers = kv.num_workers
if args.test_io:
train, val = data_loader(args, kv)
if args.test_io_mode == 'train':
data_iter = train
else:
data_iter = val
tic = time.time()
for i, batch in enumerate(train):
for i, batch in enumerate(data_iter):
if isinstance(batch, list):
for b in batch:
for j in b.data:
@ -232,232 +443,90 @@ def fit(args, network, data_loader, **kwargs):
for j in batch.data:
j.wait_to_read()
if (i + 1) % args.disp_batches == 0:
logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
args.disp_batches * args.batch_size / (time.time() - tic))
logging.info('Batch [{}]\tSpeed: {:.2f} samples/sec'.format(
i, args.disp_batches * args.batch_size / (time.time() - tic)))
tic = time.time()
return
# load model
if 'arg_params' in kwargs and 'aux_params' in kwargs:
arg_params = kwargs['arg_params']
aux_params = kwargs['aux_params']
else:
sym, arg_params, aux_params = _load_model(args, kv.rank)
# save model
checkpoint = _save_model(args, kv.rank)
epoch_end_callbacks = []
if checkpoint:
epoch_end_callbacks.append(checkpoint)
if not load_model(args, model):
# all initializers should be specified in the model definition.
# if not, this will raise an error
model.initialize(mx.init.Initializer())
# devices for training
devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
mx.gpu(int(i)) for i in args.gpus.split(',')]
devs = list(map(mx.gpu, args.gpus))
model.collect_params().reset_ctx(devs)
if args.mode == 'pred':
logging.info('Inferring image {}'.format(args.data_pred))
model_pred(args, model, data.load_image(args, args.data_pred, devs[0]))
return
# learning rate
lr, lr_scheduler = _get_lr_scheduler(args, kv)
# create model
model = mx.mod.Module(
context=devs,
symbol=network
)
lr_scheduler = get_lr_scheduler(args)
optimizer_params = {
'learning_rate': lr,
'learning_rate': 0,
'wd': args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}
'multi_precision': True,
}
# Only a limited number of optimizers have 'momentum' property
has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
if args.optimizer in has_momentum:
optimizer_params['momentum'] = args.mom
monitor = mx.mon.Monitor(
args.monitor, pattern=".*") if args.monitor > 0 else None
# A limited number of optimizers have a warmup period
has_warmup = {'lbsgd', 'lbnag'}
if args.optimizer in has_warmup:
if 'dist' in args.kv_store:
nworkers = kv.num_workers
else:
nworkers = 1
epoch_size = args.num_examples / args.batch_size / nworkers
if epoch_size < 1:
epoch_size = 1
macrobatch_size = args.macrobatch_size
if macrobatch_size < args.batch_size * nworkers:
macrobatch_size = args.batch_size * nworkers
#batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999)
batch_scale = math.ceil(
float(macrobatch_size) / args.batch_size / nworkers)
optimizer_params['updates_per_epoch'] = epoch_size
optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
optimizer_params['batch_scale'] = batch_scale
optimizer_params['warmup_strategy'] = args.warmup_strategy
optimizer_params['warmup_epochs'] = args.warmup_epochs
optimizer_params['num_epochs'] = args.num_epochs
if args.initializer == 'default':
initializer = mx.init.Xavier(
rnd_type='gaussian', factor_type="in", magnitude=2)
# initializer = mx.init.Xavier(factor_type="in", magnitude=2.34),
elif args.initializer == 'xavier':
initializer = mx.init.Xavier()
elif args.initializer == 'msra':
initializer = mx.init.MSRAPrelu()
elif args.initializer == 'orthogonal':
initializer = mx.init.Orthogonal()
elif args.initializer == 'normal':
initializer = mx.init.Normal()
elif args.initializer == 'uniform':
initializer = mx.init.Uniform()
elif args.initializer == 'one':
initializer = mx.init.One()
elif args.initializer == 'zero':
initializer = mx.init.Zero()
# evaluation metrices
if not args.no_metrics:
eval_metrics = ['crossentropy', 'accuracy']
eval_metrics = ['accuracy']
eval_metrics.append(mx.metric.create(
'top_k_accuracy', top_k=5))
else:
eval_metrics = []
supported_loss = ['ce', 'nll_loss']
if len(args.loss) > 0:
# ce or nll loss is only applicable to softmax output
loss_type_list = args.loss.split(',')
if 'softmax_output' in network.list_outputs():
for loss_type in loss_type_list:
loss_type = loss_type.strip()
if loss_type == 'nll':
loss_type = 'nll_loss'
if loss_type not in supported_loss:
logging.warning(loss_type + ' is not a valid loss type, only cross-entropy or ' \
'negative log-likelihood loss is supported!')
else:
eval_metrics.append(mx.metric.create(loss_type))
else:
logging.warning("The output is not softmax_output, loss argument will be skipped!")
# callbacks that run after each batch
batch_end_callbacks = []
batch_end_callbacks.append(mx.callback.Speedometer(
args.batch_size, args.disp_batches))
if 'batch_end_callback' in kwargs:
cbs = kwargs['batch_end_callback']
batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
report = Report('resnet{}'.format(args.num_layers), len(args.gpus.split(',')), sys.argv)
train, val = data_loader(args, kv)
train = BenchmarkingDataIter(train, args.benchmark_iters)
val = BenchmarkingDataIter(val, args.benchmark_iters)
if val is not None:
val = BenchmarkingDataIter(val, args.benchmark_iters)
class Gatherer:
def __init__(self, report, mode, data_iter, total_bs=None):
self.report = report
self.mode = mode
self.total_bs = total_bs
self.data_iter = data_iter
self.clear()
def clear(self):
self.num = 0
self.top1 = 0
self.top5 = 0
self.loss = 0
self.time = 0
self.tic = 0
def gather_metrics(self, data):
params = dict(data.eval_metric.get_global_name_value())
if self.num != 0:
self.time += time.time() - self.tic
self.num += 1
if not args.no_metrics:
self.top1 = params['accuracy']
self.top5 = params['top_k_accuracy_5']
self.loss = params['cross-entropy']
self.tic = time.time()
def add_metrics(self, *a, **k):
top1 = self.top1 * 100
top5 = self.top5 * 100
loss = self.loss
if self.num <= 1:
time = float('nan')
else:
time = self.time / (self.num - 1)
data = self.data_iter.get_avg_time_and_clear()
if self.total_bs is not None:
compute_ips = self.total_bs / (time - data)
total_ips = self.total_bs / time
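# illustrative numbers (not from the source): with total_bs=1664 and an average iteration
# time of 0.5 s, of which 0.1 s is spent in the data pipeline, compute_ips = 1664 / 0.4 = 4160 img/s
# while total_ips = 1664 / 0.5 = 3328 img/s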
if not args.no_metrics:
self.report.add_value('{}.top1'.format(self.mode), top1)
self.report.add_value('{}.top5'.format(self.mode), top5)
self.report.add_value('{}.loss'.format(self.mode), loss)
self.report.add_value('{}.time'.format(self.mode), time)
# self.report.add_value('{}.data'.format(self.mode), data)
if self.total_bs is not None:
# self.report.add_value('{}.compute_ips'.format(self.mode), compute_ips)
self.report.add_value('{}.total_ips'.format(self.mode), total_ips)
self.clear()
def save_report(*a, **k):
report.set_total_duration(time.time() - start_time)
if args.report:
report.save(args.report)
train_gatherer = Gatherer(report, 'train', train, args.batch_size)
eval_gatherer = Gatherer(report, 'val', val, args.batch_size)
batch_end_callbacks = [train_gatherer.gather_metrics] + batch_end_callbacks
epoch_end_callbacks = [train_gatherer.add_metrics, save_report] + epoch_end_callbacks
eval_batch_end_callbacks = [eval_gatherer.gather_metrics]
eval_end_callbacks = [eval_gatherer.add_metrics, save_report]
if 'horovod' in args.kv_store:
# Fetch and broadcast parameters
params = model.collect_params()
if params is not None:
hvd.broadcast_parameters(params, root_rank=0)
# run
model.fit(train,
begin_epoch=args.load_epoch if args.load_epoch else 0,
num_epoch=args.num_epochs if not args.only_inference else 0,
eval_data=val,
eval_metric=eval_metrics,
kvstore=kv,
optimizer=args.optimizer,
optimizer_params=optimizer_params,
initializer=initializer,
arg_params=arg_params,
aux_params=aux_params,
batch_end_callback=batch_end_callbacks,
epoch_end_callback=epoch_end_callbacks,
eval_batch_end_callback=eval_batch_end_callbacks,
eval_end_callback=eval_end_callbacks,
allow_missing=True,
monitor=monitor)
if args.mode in ['train_val', 'train']:
model_fit(
args,
model,
train,
begin_epoch=args.begin_epoch,
num_epoch=args.num_epochs,
eval_data=val,
eval_metric=eval_metrics,
kvstore=args.kv_store,
kv=kv,
optimizer=args.optimizer,
optimizer_params=optimizer_params,
lr_scheduler=lr_scheduler,
report=report,
model_prefix=args.model_prefix,
print_loss=not args.no_metrics,
)
elif args.mode == 'val':
for epoch in range(args.num_epochs): # loop for benchmarking
score = model_score(args, model, val, eval_metrics, args.kv_store, report=report)
for name, value in zip(*score):
logging.info('Validation {:20}: {}'.format(name, value))
else:
raise ValueError('Wrong mode')
if args.only_inference:
for epoch in range(args.num_epochs):
score = model.score(val, eval_metrics, batch_end_callback=eval_batch_end_callbacks, score_end_callback=eval_end_callbacks, epoch=epoch)
print('-------------')
for name, value in score:
print('{}: {}'.format(name, value))
mx.nd.waitall()
if args.profile_server_suffix:
mx.profiler.set_state(state='run', profile_process='server')
if args.profile_worker_suffix:
mx.profiler.set_state(state='run', profile_process='worker')
report.set_total_duration(time.time() - start_time)
if args.report:
suffix = '-{}'.format(hvd.rank()) if 'horovod' in args.kv_store and hvd.rank() != 0 else ''
report.save(args.report + suffix)
save_report()
print('Experiment took: {} sec'.format(report.total_duration))
logging.info('Experiment took: {} sec'.format(report.total_duration))

File diff suppressed because it is too large

View file

Binary image files changed (image previews omitted).

View file

@ -0,0 +1,522 @@
# Copyright 2017-2018 The Apache Software Foundation
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import mxnet as mx
from mxnet.gluon.block import HybridBlock
from mxnet.gluon import nn
def add_model_args(parser):
model = parser.add_argument_group('Model')
model.add_argument('--arch', default='resnetv15',
choices=['resnetv1', 'resnetv15',
'resnextv1', 'resnextv15',
'xception'],
help='model architecture')
model.add_argument('--num-layers', type=int, default=50,
help='number of layers in the neural network, \
required by some networks such as resnet')
model.add_argument('--num-groups', type=int, default=32,
help='number of groups for grouped convolutions, \
required by some networks such as resnext')
model.add_argument('--num-classes', type=int, default=1000,
help='the number of classes')
model.add_argument('--batchnorm-eps', type=float, default=1e-5,
help='the amount added to the batchnorm variance to prevent output explosion.')
model.add_argument('--batchnorm-mom', type=float, default=0.9,
help='the leaky-integrator factor controlling the batchnorm mean and variance.')
model.add_argument('--fuse-bn-relu', type=int, default=0,
help='have batchnorm kernel perform activation relu')
model.add_argument('--fuse-bn-add-relu', type=int, default=0,
help='have batchnorm kernel perform add followed by activation relu')
return model
class Builder:
def __init__(self, dtype, input_layout, conv_layout, bn_layout,
pooling_layout, bn_eps, bn_mom, fuse_bn_relu, fuse_bn_add_relu):
self.dtype = dtype
self.input_layout = input_layout
self.conv_layout = conv_layout
self.bn_layout = bn_layout
self.pooling_layout = pooling_layout
self.bn_eps = bn_eps
self.bn_mom = bn_mom
self.fuse_bn_relu = fuse_bn_relu
self.fuse_bn_add_relu = fuse_bn_add_relu
self.act_type = 'relu'
self.bn_gamma_initializer = lambda last: 'zeros' if last else 'ones'
self.linear_initializer = lambda groups=1: mx.init.Xavier(rnd_type='gaussian', factor_type="in",
magnitude=2 * (groups ** 0.5))
self.last_layout = self.input_layout
def copy(self):
return copy.copy(self)
def batchnorm(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
nn.BatchNorm(axis=bn_axis, momentum=self.bn_mom, epsilon=self.bn_eps,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
def batchnorm_add_relu(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
if self.fuse_bn_add_relu:
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
BatchNormAddRelu(axis=bn_axis, momentum=self.bn_mom,
epsilon=self.bn_eps, act_type=self.act_type,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
return NonFusedBatchNormAddRelu(self, last=last)
def batchnorm_relu(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
if self.fuse_bn_relu:
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
nn.BatchNorm(axis=bn_axis, momentum=self.bn_mom,
epsilon=self.bn_eps, act_type=self.act_type,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
return self.sequence(self.batchnorm(last=last), self.activation())
def activation(self):
return nn.Activation(self.act_type)
def global_avg_pool(self):
return self.sequence(
self.transpose(self.pooling_layout),
nn.GlobalAvgPool2D(layout=self.pooling_layout)
)
def max_pool(self, pool_size, strides=1, padding=True):
padding = pool_size // 2 if padding is True else int(padding)
return self.sequence(
self.transpose(self.pooling_layout),
nn.MaxPool2D(pool_size, strides=strides, padding=padding,
layout=self.pooling_layout)
)
def conv(self, channels, kernel_size, padding=True, strides=1, groups=1, in_channels=0):
padding = kernel_size // 2 if padding is True else int(padding)
initializer = self.linear_initializer(groups=groups)
return self.sequence(
self.transpose(self.conv_layout),
nn.Conv2D(channels, kernel_size=kernel_size, strides=strides,
padding=padding, use_bias=False, groups=groups,
in_channels=in_channels, layout=self.conv_layout,
weight_initializer=initializer)
)
def separable_conv(self, channels, kernel_size, in_channels, padding=True, strides=1):
return self.sequence(
self.conv(in_channels, kernel_size, padding=padding,
strides=strides, groups=in_channels, in_channels=in_channels),
self.conv(channels, 1, in_channels=in_channels)
)
def dense(self, units, in_units=0):
return nn.Dense(units, in_units=in_units,
weight_initializer=self.linear_initializer())
def transpose(self, to_layout):
if self.last_layout == to_layout:
return None
ret = Transpose(self.last_layout, to_layout)
self.last_layout = to_layout
return ret
def sequence(self, *seq):
seq = list(filter(lambda x: x is not None, seq))
if len(seq) == 1:
return seq[0]
ret = nn.HybridSequential()
ret.add(*seq)
return ret
class Transpose(HybridBlock):
def __init__(self, from_layout, to_layout):
super().__init__()
supported_layouts = ['NCHW', 'NHWC']
if from_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(from_layout))
if to_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(to_layout))
self.from_layout = from_layout
self.to_layout = to_layout
def hybrid_forward(self, F, x):
# Insert transpose if from_layout and to_layout don't match
if self.from_layout == 'NCHW' and self.to_layout == 'NHWC':
return F.transpose(x, axes=(0, 2, 3, 1))
elif self.from_layout == 'NHWC' and self.to_layout == 'NCHW':
return F.transpose(x, axes=(0, 3, 1, 2))
else:
return x
def __repr__(self):
s = '{name}({content})'
if self.from_layout == self.to_layout:
content = 'passthrough ' + self.from_layout
else:
content = self.from_layout + ' -> ' + self.to_layout
return s.format(name=self.__class__.__name__,
content=content)
class LayoutWrapper(HybridBlock):
def __init__(self, op, io_layout, op_layout, **kwargs):
super(LayoutWrapper, self).__init__(**kwargs)
with self.name_scope():
self.layout1 = Transpose(io_layout, op_layout)
self.op = op
self.layout2 = Transpose(op_layout, io_layout)
def hybrid_forward(self, F, *x):
return self.layout2(self.op(*(self.layout1(y) for y in x)))
class BatchNormAddRelu(nn.BatchNorm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self._kwargs.pop('act_type') != 'relu':
raise ValueError('BatchNormAddRelu can be used only with ReLU as activation')
def hybrid_forward(self, F, x, y, gamma, beta, running_mean, running_var):
return F.BatchNormAddRelu(data=x, addend=y, gamma=gamma, beta=beta,
moving_mean=running_mean, moving_var=running_var, name='fwd', **self._kwargs)
class NonFusedBatchNormAddRelu(HybridBlock):
def __init__(self, builder, **kwargs):
super().__init__()
self.bn = builder.batchnorm(**kwargs)
self.act = builder.activation()
def hybrid_forward(self, F, x, y):
return self.act(self.bn(x) + y)
# Blocks
class ResNetBasicBlock(HybridBlock):
def __init__(self, builder, channels, stride, downsample=False, in_channels=0,
version='1', resnext_groups=None, **kwargs):
super().__init__()
assert not resnext_groups
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = [
builder.conv(channels, 3, strides=stride, in_channels=in_channels),
builder.batchnorm_relu(),
builder.conv(channels, 3),
]
self.body = builder.sequence(*body)
self.bn_add_relu = builder.batchnorm_add_relu(last=True)
builder = builder_copy
if downsample:
self.downsample = builder.sequence(
builder.conv(channels, 1, strides=stride, in_channels=in_channels),
builder.batchnorm()
)
else:
self.downsample = None
def hybrid_forward(self, F, x):
if self.transpose is not None:
x = self.transpose(x)
residual = x
x = self.body(x)
if self.downsample:
residual = self.downsample(residual)
x = self.bn_add_relu(x, residual)
return x
class ResNetBottleNeck(HybridBlock):
def __init__(self, builder, channels, stride, downsample=False, in_channels=0,
version='1', resnext_groups=None):
super().__init__()
stride1 = stride if version == '1' else 1
stride2 = 1 if version == '1' else stride
mult = 2 if resnext_groups else 1
groups = resnext_groups or 1
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = [
builder.conv(channels * mult // 4, 1, strides=stride1, in_channels=in_channels),
builder.batchnorm_relu(),
builder.conv(channels * mult // 4, 3, strides=stride2),
builder.batchnorm_relu(),
builder.conv(channels, 1)
]
self.body = builder.sequence(*body)
self.bn_add_relu = builder.batchnorm_add_relu(last=True)
builder = builder_copy
if downsample:
self.downsample = builder.sequence(
builder.conv(channels, 1, strides=stride, in_channels=in_channels),
builder.batchnorm()
)
else:
self.downsample = None
def hybrid_forward(self, F, x):
if self.transpose is not None:
x = self.transpose(x)
residual = x
x = self.body(x)
if self.downsample:
residual = self.downsample(residual)
x = self.bn_add_relu(x, residual)
return x
class XceptionBlock(HybridBlock):
def __init__(self, builder, definition, in_channels, relu_at_beginning=True):
super().__init__()
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = []
if relu_at_beginning:
body.append(builder.activation())
last_channels = in_channels
for channels1, channels2 in zip(definition, definition[1:] + [0]):
if channels1 > 0:
body.append(builder.separable_conv(channels1, 3, in_channels=last_channels))
if channels2 > 0:
body.append(builder.batchnorm_relu())
else:
body.append(builder.batchnorm(last=True))
last_channels = channels1
else:
body.append(builder.max_pool(3, 2))
self.body = builder.sequence(*body)
builder = builder_copy
if any(map(lambda x: x <= 0, definition)):
self.shortcut = builder.sequence(
builder.conv(last_channels, 1, strides=2, in_channels=in_channels),
builder.batchnorm(),
)
else:
self.shortcut = builder.sequence()
def hybrid_forward(self, F, x):
return self.shortcut(x) + self.body(x)
# Nets
class ResNet(HybridBlock):
def __init__(self, builder, block, layers, channels, classes=1000,
version='1', resnext_groups=None):
super().__init__()
assert len(layers) == len(channels) - 1
self.version = version
with self.name_scope():
features = [
builder.conv(channels[0], 7, strides=2),
builder.batchnorm_relu(),
builder.max_pool(3, 2),
]
for i, num_layer in enumerate(layers):
stride = 1 if i == 0 else 2
features.append(self.make_layer(builder, block, num_layer, channels[i+1],
stride, in_channels=channels[i],
resnext_groups=resnext_groups))
features.append(builder.global_avg_pool())
self.features = builder.sequence(*features)
self.output = builder.dense(classes, in_units=channels[-1])
def make_layer(self, builder, block, layers, channels, stride,
in_channels=0, resnext_groups=None):
layer = []
layer.append(block(builder, channels, stride, channels != in_channels,
in_channels=in_channels, version=self.version,
resnext_groups=resnext_groups))
for _ in range(layers-1):
layer.append(block(builder, channels, 1, False, in_channels=channels,
version=self.version, resnext_groups=resnext_groups))
return builder.sequence(*layer)
def hybrid_forward(self, F, x):
x = self.features(x)
x = self.output(x)
return x
class Xception(HybridBlock):
def __init__(self, builder,
definition=([32, 64],
[[128, 128, 0], [256, 256, 0], [728, 728, 0],
*([[728, 728, 728]] * 8), [728, 1024, 0]],
[1536, 2048]),
classes=1000):
super().__init__()
definition1, definition2, definition3 = definition
with self.name_scope():
features = []
last_channels = 0
for i, channels in enumerate(definition1):
features += [
builder.conv(channels, 3, strides=(2 if i == 0 else 1), in_channels=last_channels),
builder.batchnorm_relu(),
]
last_channels = channels
for i, block_definition in enumerate(definition2):
features.append(XceptionBlock(builder, block_definition, in_channels=last_channels,
relu_at_beginning=False if i == 0 else True))
last_channels = list(filter(lambda x: x > 0, block_definition))[-1]
for i, channels in enumerate(definition3):
features += [
builder.separable_conv(channels, 3, in_channels=last_channels),
builder.batchnorm_relu(),
]
last_channels = channels
features.append(builder.global_avg_pool())
self.features = builder.sequence(*features)
self.output = builder.dense(classes, in_units=last_channels)
def hybrid_forward(self, F, x):
x = self.features(x)
x = self.output(x)
return x
resnet_spec = {18: (ResNetBasicBlock, [2, 2, 2, 2], [64, 64, 128, 256, 512]),
34: (ResNetBasicBlock, [3, 4, 6, 3], [64, 64, 128, 256, 512]),
50: (ResNetBottleNeck, [3, 4, 6, 3], [64, 256, 512, 1024, 2048]),
101: (ResNetBottleNeck, [3, 4, 23, 3], [64, 256, 512, 1024, 2048]),
152: (ResNetBottleNeck, [3, 8, 36, 3], [64, 256, 512, 1024, 2048])}
def create_resnet(builder, version, num_layers=50, resnext=False, classes=1000):
assert num_layers in resnet_spec, \
"Invalid number of layers: {}. Options are {}".format(
num_layers, str(resnet_spec.keys()))
block_class, layers, channels = resnet_spec[num_layers]
assert not resnext or num_layers >= 50, \
"Cannot create resnext with less then 50 layers"
net = ResNet(builder, block_class, layers, channels, version=version,
resnext_groups=args.num_groups if resnext else None)
return net
class fp16_model(mx.gluon.block.HybridBlock):
def __init__(self, net, **kwargs):
super(fp16_model, self).__init__(**kwargs)
with self.name_scope():
self._net = net
def hybrid_forward(self, F, x):
y = self._net(x)
y = F.cast(y, dtype='float32')
return y
def get_model(arch, num_classes, num_layers, image_shape, dtype, amp,
input_layout, conv_layout, batchnorm_layout, pooling_layout,
batchnorm_eps, batchnorm_mom, fuse_bn_relu, fuse_bn_add_relu, **kwargs):
builder = Builder(
dtype = dtype,
input_layout = input_layout,
conv_layout = conv_layout,
bn_layout = batchnorm_layout,
pooling_layout = pooling_layout,
bn_eps = batchnorm_eps,
bn_mom = batchnorm_mom,
fuse_bn_relu = fuse_bn_relu,
fuse_bn_add_relu = fuse_bn_add_relu,
)
if arch.startswith('resnet') or arch.startswith('resnext'):
version = '1' if arch in {'resnetv1', 'resnextv1'} else '1.5'
net = create_resnet(
builder = builder,
version = version,
resnext = arch.startswith('resnext'),
num_layers = num_layers,
classes = num_classes,
)
elif arch == 'xception':
net = Xception(builder, classes=num_classes)
else:
raise ValueError('Wrong model architecture')
net.hybridize(static_shape=True, static_alloc=True)
if not amp:
net.cast(dtype)
if dtype == 'float16':
net = fp16_model(net)
return net
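For orientation, a minimal sketch of how this factory might be called. The argument values are illustrative (mirroring the defaults exposed by add_model_args where possible), and the NHWC layouts assume the NVIDIA MXNet build this repository targets:
import mxnet as mx
net = get_model(arch='resnetv15', num_classes=1000, num_layers=50,
                image_shape='3,224,224', dtype='float16', amp=False,
                input_layout='NCHW', conv_layout='NHWC',
                batchnorm_layout='NHWC', pooling_layout='NHWC',
                batchnorm_eps=1e-5, batchnorm_mom=0.9,
                fuse_bn_relu=0, fuse_bn_add_relu=0)
# per-layer initializers are set inside the Builder, so a bare base Initializer suffices here
net.initialize(mx.init.Initializer(), ctx=mx.gpu(0))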

View file

@ -21,15 +21,21 @@
# - "metrics" : per epoch metrics for train and validation
# (some of below metrics may not exist in the report,
# depending on application arguments)
# - "train.top1" : training top1 accuracy in epoch.
# - "train.top5" : training top5 accuracy in epoch.
# - "train.loss" : training loss in epoch.
# - "train.time" : average training time of iteration in seconds.
# - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
# - "val.top1", "val.top5", "val.loss", "val.time", "val.total_ips" : the same but for validation.
# - "train.top1" : training top1 accuracy in epoch.
# - "train.top5" : training top5 accuracy in epoch.
# - "train.loss" : training loss in epoch.
# - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
# - "train.latency_avg" : average latency of one iteration in seconds.
# - "train.latency_50" : median latency of one iteration in seconds.
# - "train.latency_90" : 90th percentile latency of one iteration in seconds.
# - "train.latency_95" : 95th percentile latency of one iteration in seconds.
# - "train.latency_99" : 99th percentile latency of one iteration in seconds.
# - "train.latency_100" : highest observed latency of one iteration in seconds.
# - "val.top1", "val.top5", "val.time", "val.total_ips", "val.latency_avg", "val.latency_50",
# "val.latency_90", "val.latency_95", "val.latency_99", "val.latency_100" : the same but for validation.
import json
from collections import defaultdict, OrderedDict
from collections import OrderedDict
class Report:
def __init__(self, model_name, ngpus, cmd):
@ -37,15 +43,21 @@ class Report:
self.ngpus = ngpus
self.cmd = cmd
self.total_duration = 0
self.metrics = defaultdict(lambda: [])
self.metrics = OrderedDict()
def add_value(self, metric, value):
if metric not in self.metrics:
self.metrics[metric] = []
self.metrics[metric].append(value)
def set_total_duration(self, duration):
self.total_duration = duration
def save(self, filename):
with open(filename, 'w') as f:
f.write(self.get_report())
def get_report(self):
report = OrderedDict([
('model', self.model_name),
('ngpus', self.ngpus),
@ -53,5 +65,4 @@ class Report:
('cmd', self.cmd),
('metrics', self.metrics),
])
with open(filename, 'w') as f:
json.dump(report, f, indent=4)
return json.dumps(report, indent=4)
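A minimal usage sketch of the Report class described above (metric names and values are illustrative):
import sys
import time
report = Report('resnet50', ngpus=8, cmd=sys.argv)
start = time.time()
report.add_value('train.total_ips', 12345.6)   # one value is appended per epoch
report.add_value('val.top1', 0.761)
report.set_total_duration(time.time() - start)
report.save('report.json')                     # writes the OrderedDict above as indented JSON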

View file

@ -1,376 +0,0 @@
# Copyright 2017-2018 The Apache Software Foundation
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
(Original author Wei Wu) by Antti-Pekka Hynninen
"Flexible Layout" (fl) version created by Dick Carter.
Implementing the original resnet ILSVRC 2015 winning network from:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
'''
import mxnet as mx
import numpy as np
import random
# Transform a symbol from one layout to another, or do nothing if they have the same layout
def transform_layout(data, from_layout, to_layout):
supported_layouts = ['NCHW', 'NHWC']
if from_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(from_layout))
if to_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(to_layout))
# Insert transpose if from_layout and to_layout don't match
if from_layout == 'NCHW' and to_layout == 'NHWC':
return mx.sym.transpose(data, axes=(0, 2, 3, 1))
elif from_layout == 'NHWC' and to_layout == 'NCHW':
return mx.sym.transpose(data, axes=(0, 3, 1, 2))
else:
return data
# A BatchNorm wrapper that responds to the input layout
def batchnorm(data, io_layout, batchnorm_layout, **kwargs):
# Transpose as needed to batchnorm_layout
transposed_as_needed = transform_layout(data, io_layout, batchnorm_layout)
bn_axis = 3 if batchnorm_layout == 'NHWC' else 1
batchnormed = mx.sym.BatchNorm(data=transposed_as_needed, axis=bn_axis, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(batchnormed, batchnorm_layout, io_layout)
# A BatchNormAddRelu wrapper that responds to the input layout
def batchnorm_add_relu(data, addend, io_layout, batchnorm_layout, **kwargs):
# Transpose as needed to batchnorm_layout
transposed_data_as_needed = transform_layout(data, io_layout, batchnorm_layout)
transposed_addend_as_needed = transform_layout(addend, io_layout, batchnorm_layout)
bn_axis = 3 if batchnorm_layout == 'NHWC' else 1
batchnormed = mx.sym.BatchNormAddRelu(data=transposed_data_as_needed,
addend=transposed_addend_as_needed,
axis=bn_axis, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(batchnormed, batchnorm_layout, io_layout)
# A Pooling wrapper that responds to the input layout
def pooling(data, io_layout, pooling_layout, **kwargs):
# Pooling kernel, as specified by pooling_layout, may be in conflict with i/o layout.
transposed_as_needed = transform_layout(data, io_layout, pooling_layout)
pooled = mx.sym.Pooling(data=transposed_as_needed, layout=pooling_layout, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(pooled, pooling_layout, io_layout)
# Assumption is that data comes in and out in the 'conv_layout' format.
# If this format is different from the 'batchnorm_layout' format, then the batchnorm() routine
# will introduce transposes on both sides of the mx.sym.BatchNorm symbol
def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True,
workspace=256, memonger=False, conv_layout='NCHW', batchnorm_layout='NCHW',
verbose=False, cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
fuse_bn_relu=False, fuse_bn_add_relu=False, cudnn_tensor_core_only=False):
"""Return ResNet Unit symbol for building ResNet
Parameters
----------
data : str
Input data
num_filter : int
Number of output channels
bnf : int
Bottle neck channels factor with regard to num_filter
stride : tuple
Stride used in convolution
dim_match : Boolean
True means channel number between input and output is the same, otherwise means differ
name : str
Base name of the operators
workspace : int
Workspace used in convolution operator
"""
act = 'relu' if fuse_bn_relu else None
if bottle_neck:
conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn2', cudnn_off=cudnn_bn_off, act_type=act)
act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') if not fuse_bn_relu else bn2
conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
workspace=workspace, name=name + '_conv3', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
if dim_match:
shortcut = data
else:
conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_sc', cudnn_off=cudnn_bn_off)
if memonger:
shortcut._set_attr(mirror_stage='True')
if fuse_bn_add_relu:
return batchnorm_add_relu(data=conv3, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
else:
bn3 = batchnorm(data=conv3, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3')
else:
conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
if dim_match:
shortcut = data
else:
conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_sc', cudnn_off=cudnn_bn_off)
if memonger:
shortcut._set_attr(mirror_stage='True')
if fuse_bn_add_relu:
return batchnorm_add_relu(data=conv2, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
else:
bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu2')
def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, workspace=256, dtype='float32', memonger=False,
input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW', verbose=False,
cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True):
"""Return ResNet symbol of
Parameters
----------
units : list
Number of units in each stage
num_stages : int
Number of stage
filter_list : list
Channel size of each stage
num_classes : int
Output size of symbol
dataset : str
Dataset type; only cifar10 and imagenet are supported
workspace : int
Workspace used in convolution operator
dtype : str
Precision (float32 or float16)
memonger : boolean
Activates "memory monger" to reduce the model's memory footprint
input_layout : str
interpretation (e.g. NCHW vs NHWC) of data provided by the i/o pipeline (may introduce transposes
if in conflict with 'layout' above)
conv_layout : str
interpretation (e.g. NCHW vs NHWC) of data for convolution operation.
batchnorm_layout : str
directs which kernel performs the batchnorm (may introduce transposes if in conflict with 'conv_layout' above)
pooling_layout : str
directs which kernel performs the pooling (may introduce transposes if in conflict with 'conv_layout' above)
"""
act = 'relu' if fuse_bn_relu else None
num_unit = len(units)
assert(num_unit == num_stages)
data = mx.sym.Variable(name='data')
if not use_dali:
# double buffering of data
if dtype == 'float32':
data = mx.sym.identity(data=data, name='id')
else:
if dtype == 'float16':
data = mx.sym.Cast(data=data, dtype=np.float16)
(nchannel, height, width) = image_shape
# Insert transpose as needed to get the input layout to match the desired processing layout
data = transform_layout(data, input_layout, conv_layout)
if height <= 32: # such as cifar10
body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=force_tensor_core)
# Is this BatchNorm supposed to be here?
body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off)
else: # often expected to be 224 such as imagenet
body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=force_tensor_core)
body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off, act_type=act)
if not fuse_bn_relu:
body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
body = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')
for i in range(num_stages):
body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
name='stage%d_unit%d' % (i + 1, 1),
bottle_neck=bottle_neck, workspace=workspace,
memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps=bn_eps, bn_mom=bn_mom,
conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
cudnn_tensor_core_only=force_tensor_core)
for j in range(units[i]-1):
body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
bottle_neck=bottle_neck, workspace=workspace,
memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps = bn_eps, bn_mom=bn_mom,
conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
cudnn_tensor_core_only=force_tensor_core)
# bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
# relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
# Although the kernel size is not used when global_pool=True, one must still be provided
pool1 = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
flat = mx.sym.Flatten(data=pool1)
fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', cublas_algo_verbose=verbose)
if dtype == 'float16':
fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32',
input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW',
verbose=False, seed=None, cudnn_bn_off=False, batchnorm_eps=2e-5, batchnorm_mom=0.9,
conv_algo=-1, fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True, **kwargs):
"""
Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
(Original author Wei Wu) by Antti-Pekka Hynninen
Implementing the original resnet ILSVRC 2015 winning network from:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
"""
if seed is not None:
print('Setting seeds to %s' % (seed,))
random.seed(seed)
np.random.seed(seed)
mx.random.seed(seed)
image_shape = [int(l) for l in image_shape.split(',')]
(nchannel, height, width) = image_shape
if height <= 28:
num_stages = 3
if (num_layers-2) % 9 == 0 and num_layers >= 164:
per_unit = [(num_layers-2)//9]
filter_list = [16, 64, 128, 256]
bottle_neck = True
elif (num_layers-2) % 6 == 0 and num_layers < 164:
per_unit = [(num_layers-2)//6]
filter_list = [16, 16, 32, 64]
bottle_neck = False
else:
raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
units = per_unit * num_stages
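# e.g. (illustrative): num_layers=110 -> (110-2) % 6 == 0, so per_unit = [18] and units = [18, 18, 18]; num_layers=164 -> per_unit = [(164-2)//9] = [18] with bottleneck units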
else:
if num_layers >= 50:
filter_list = [64, 256, 512, 1024, 2048]
bottle_neck = True
else:
filter_list = [64, 64, 128, 256, 512]
bottle_neck = False
num_stages = 4
if num_layers == 18:
units = [2, 2, 2, 2]
elif num_layers == 34:
units = [3, 4, 6, 3]
elif num_layers == 50:
units = [3, 4, 6, 3]
elif num_layers == 101:
units = [3, 4, 23, 3]
elif num_layers == 152:
units = [3, 8, 36, 3]
elif num_layers == 200:
units = [3, 24, 36, 3]
elif num_layers == 269:
units = [3, 30, 48, 8]
else:
raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
return resnet(units = units,
num_stages = num_stages,
filter_list = filter_list,
num_classes = num_classes,
image_shape = image_shape,
bottle_neck = bottle_neck,
workspace = conv_workspace,
dtype = dtype,
input_layout = input_layout,
conv_layout = conv_layout,
batchnorm_layout = batchnorm_layout,
pooling_layout = pooling_layout,
verbose = verbose,
cudnn_bn_off = cudnn_bn_off,
bn_eps = batchnorm_eps,
bn_mom = batchnorm_mom,
conv_algo = conv_algo,
fuse_bn_relu = fuse_bn_relu,
fuse_bn_add_relu = fuse_bn_add_relu,
force_tensor_core = force_tensor_core,
use_dali = use_dali)
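For reference, a sketch of how this (now removed) symbolic entry point was typically driven; values are illustrative and the NHWC layouts assume the NVIDIA MXNet build:
import mxnet as mx
sym = get_symbol(num_classes=1000, num_layers=50, image_shape='3,224,224',
                 dtype='float16', input_layout='NCHW', conv_layout='NHWC',
                 batchnorm_layout='NHWC', pooling_layout='NHWC')
mod = mx.mod.Module(symbol=sym, context=[mx.gpu(0)])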

View file

@ -14,77 +14,56 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os, socket
from argparse import ArgumentParser
import warnings
import os
import argparse
from pathlib import Path
optparser = ArgumentParser(description="train resnet50 with MXNet")
optparser.add_argument("-n", "--n-GPUs", type=int, default=8, help="number of GPUs to use; " +\
"default = 8")
optparser.add_argument("-b", "--batch-size", type=int, default=208, help="batch size per GPU; " +\
"default = 208")
optparser.add_argument("-e", "--num-epochs", type=int, default=90, help="number of epochs; " +\
"default = 90")
optparser.add_argument("-l", "--lr", type=float, default=0.1, help="learning rate; default = 0.1; " +\
"IMPORTANT: true learning rate will be calculated as `lr * batch_size/256`")
optparser.add_argument("--no-val", action="store_true",
help="if set no validation will be performed")
optparser.add_argument("--no-dali", action="store_true", default=False,
help="use default MXNet pipeline instead of DALI")
optparser.add_argument("--data-root", type=str, help="Directory with RecordIO data files", default="/data/imagenet/train-val-recordio-passthrough")
optparser.add_argument("--data-nthreads", type=int, help="number of threads for data loading; default = 40", default=40)
optparser.add_argument("--dtype", type=str, help="Precision, float16 or float32", default="float16")
optparser = argparse.ArgumentParser(description='Train classification models on ImageNet',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
optparser.add_argument('-n', '--ngpus', type=int, default=1, help='number of GPUs to use')
optparser.add_argument('-b', '--batch-size', type=int, default=192, help='batch size per GPU')
optparser.add_argument('-e', '--num-epochs', type=int, default=90, help='number of epochs')
optparser.add_argument('-l', '--lr', type=float, default=0.256, help='learning rate; '
'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`')
optparser.add_argument('--data-root', type=Path, help='Directory with RecordIO data files', default=Path('/data/imagenet/train-val-recordio-passthrough'))
optparser.add_argument('--dtype', help='Precision', default='float16', choices=('float32', 'float16'))
optparser.add_argument('--kv-store', default='horovod', choices=('device', 'horovod'), help='key-value store type')
optparser.add_argument('--data-backend', default='dali-gpu', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'), help='data backend')
opts, args = optparser.parse_known_args()
if opts.dtype == "float16":
n_ch = str(4 - int(opts.no_dali))
if opts.dtype == 'float16':
n_ch = str(4 - int(opts.data_backend == 'mxnet'))
else:
n_ch = str(3)
opts.batch_size *= opts.n_GPUs
opts.batch_size *= opts.ngpus
opts.lr *= opts.batch_size / 256
opts.lr *= opts.batch_size/256
command = ""
command += "python "+os.path.dirname(__file__)+"/train.py"
command += " --num-layers 50"
command += " --data-train " + opts.data_root + "/train.rec"
command += " --data-train-idx " + opts.data_root + "/train.idx"
if not opts.no_val:
command += " --data-val " + opts.data_root + "/val.rec"
command += " --data-val-idx " + opts.data_root + "/val.idx"
command += " --data-nthreads " + str(opts.data_nthreads)
command += " --optimizer sgd --dtype " + opts.dtype
command += " --lr-step-epochs 30,60,80 --max-random-area 1"
command += " --min-random-area 0.05 --max-random-scale 1"
command += " --min-random-scale 1 --min-random-aspect-ratio 0.75"
command += " --max-random-aspect-ratio 1.33 --max-random-shear-ratio 0"
command += " --max-random-rotate-angle 0 --random-resized-crop 1"
command += " --random-crop 0 --random-mirror 1"
command += " --image-shape "+n_ch+",224,224 --warmup-epochs 5"
command += " --disp-batches 20"
command += " --batchnorm-mom 0.9 --batchnorm-eps 1e-5"
command = []
if 'horovod' in opts.kv_store:
command += ['horovodrun', '-np', str(opts.ngpus)]
command += ['python', str(Path(__file__).parent / "train.py")]
command += ['--data-train', str(opts.data_root / "train.rec")]
command += ['--data-train-idx', str(opts.data_root / "train.idx")]
command += ['--data-val', str(opts.data_root / "val.rec")]
command += ['--data-val-idx', str(opts.data_root / "val.idx")]
command += ['--dtype', opts.dtype]
command += ['--image-shape', n_ch + ',224,224']
if opts.dtype == 'float16':
command += " --fuse-bn-relu 1"
command += " --input-layout NHWC --conv-layout NHWC"
command += " --batchnorm-layout NHWC --pooling-layout NHWC"
command += " --conv-algo 1 --force-tensor-core 1"
command += " --fuse-bn-add-relu 1"
command += '--fuse-bn-relu 1 --fuse-bn-add-relu 1'.split()
command += '--input-layout NCHW --conv-layout NHWC ' \
'--batchnorm-layout NHWC --pooling-layout NHWC'.split()
command += " --kv-store device"
if not opts.no_dali:
command += " --use-dali"
command += " --dali-prefetch-queue 2 --dali-nvjpeg-memory-padding 64"
command += " --lr "+str(opts.lr)
command += " --gpus " + str(list(range(opts.n_GPUs))).replace(' ', '').replace('[', '').replace(']', '')
command += " --batch-size " + str(opts.batch_size)
command += " --num-epochs " + str(opts.num_epochs)
command += ['--kv-store', opts.kv_store]
command += ['--data-backend', opts.data_backend]
command += ['--lr', str(opts.lr)]
command += ['--gpus', ','.join(list(map(str, range(opts.ngpus))))]
command += ['--batch-size', str(opts.batch_size)]
command += ['--num-epochs', str(opts.num_epochs)]
command += args
for arg in args:
command += " " + arg
os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
@ -92,5 +71,11 @@ os.environ['MXNET_USE_TENSORRT'] = "0"
os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"
os.environ['HOROVOD_CYCLE_TIME'] = "0.1"
os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864"
os.environ['HOROVOD_NUM_NCCL_STREAMS'] = "2"
os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16"
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999"
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25"
exit(os.system('/bin/bash -c "'+command+'"'))
os.execvp(command[0], command)

View file

@ -1,3 +1,4 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -12,8 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -lt 2 ] ; then
echo "usage: $0 raw_dataset prepared_dataset"
exit 1
fi
# This script launches the ResNet50 benchmark in FP16 on 1, 4 and 8 GPUs with batch sizes 64, 128, 192 and 208
# Usage: ./BENCHMARK_FP16.sh <additional flags>
python benchmark.py -n 1,4,8 -b 64,128,192,208 -e 2 -w 1 -i 100 -o report.json $@
cd "$2" &&
python /opt/mxnet/tools/im2rec.py --list --recursive train "$1/train" &&
python /opt/mxnet/tools/im2rec.py --list --recursive val "$1/val" &&
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train "$1/train" &&
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val "$1/val" &&
echo "Dataset was prepared succesfully!"

View file

@ -34,58 +34,37 @@
# limitations under the License.
import os
import sys
import argparse
import logging
logging.basicConfig(level=logging.DEBUG)
import data, dali, fit
import mxnet as mx
import numpy as np
def set_imagenet_aug(aug):
# standard data augmentation setting for imagenet training
aug.set_defaults(rgb_mean='123.68,116.779,103.939', rgb_std='58.393,57.12,57.375')
aug.set_defaults(random_crop=0, random_resized_crop=1, random_mirror=1)
aug.set_defaults(min_random_area=0.08)
aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.)
aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1)
import data, dali
import fit
import models
if __name__ == '__main__':
# parse args
parser = argparse.ArgumentParser(description="train resnet on imagenet",
def parse_args():
parser = argparse.ArgumentParser(description="Train classification models on ImageNet",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
models.add_model_args(parser)
fit.add_fit_args(parser)
data.add_data_args(parser)
dali.add_dali_args(parser)
data.add_data_aug_args(parser)
# Instead, to get standard resnet augmentation on a per-use basis, invoke as in:
# train_imagenet.py --set-resnet-aug ...
# Finally, to get the legacy MXNet v1.2 training settings on a per-use basis, invoke as in:
# train_imagenet.py --set-data-aug-level 3
parser.set_defaults(
# network
num_layers = 50,
return parser.parse_args()
# data
resize = 256,
num_classes = 1000,
num_examples = 1281167,
image_shape = '3,224,224',
min_random_scale = 1, # if input image has min size k, suggest to use
# 256.0/x, e.g. 0.533 for 480
# train
num_epochs = 90,
lr_step_epochs = '30,60,80',
dtype = 'float32'
)
args = parser.parse_args()
def setup_logging(args):
head = '{asctime}:{levelname}: {message}'
logging.basicConfig(level=logging.DEBUG, format=head, style='{',
handlers=[logging.StreamHandler(sys.stderr), logging.FileHandler(args.log)])
logging.info('Start with arguments {}'.format(args))
if not args.use_dali:
data.set_data_aug_level(parser, 0)
if __name__ == '__main__':
args = parse_args()
setup_logging(args)
# load network
import resnet as net
sym = net.get_symbol(**vars(args))
model = models.get_model(**vars(args))
data_loader = data.get_data_loader(args)
# train
fit.fit(args, sym, dali.get_rec_iter)
fit.fit(args, model, data_loader)

PyTorch/Detection/SSD/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
**/__pycache__

View file

@ -1,11 +1,11 @@
FROM nvcr.io/nvidia/pytorch:19.05-py3
FROM nvcr.io/nvidia/pytorch:19.08-py3
# Set working directory
WORKDIR /workspace
ENV PYTHONPATH "${PYTHONPATH}:/workspace"
RUN apt-get update && apt-get install -y python3-tk python-pip git tmux htop tree
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-tk python-pip git tmux htop tree
# Necessary pip packages
RUN pip install --upgrade pip

View file

@ -242,11 +242,11 @@ The following section lists the requirements in order to start training the SSD3
### Requirements
This repository contains `Dockerfile` which extends the PyTorch 19.06 NGC container
This repository contains `Dockerfile` which extends the PyTorch 19.08 NGC container
and encapsulates some dependencies. Aside from these dependencies,
ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.06-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [PyTorch 19.08-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
@ -256,7 +256,7 @@ Documentation:
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the [PyTorch 19.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch),
For those unable to use the [PyTorch 19.08-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch),
to set up the required environment or create your own container,
see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
@ -537,9 +537,9 @@ The flag `--save` flag enables storing checkpoints after each epoch under `./mod
Our scripts for SSD300 v1.1 present two ways to run inference.
To get meaningful results, you need a pre-trained model checkpoint.
One way is to run an interactive session on Jupyter notebook, as described in a [Quick Start Guide](#8-start-inferencepredictions).
One way is to run an interactive session in a Jupyter notebook, as described in the 8th step of the [Quick Start Guide](#quick-start-guide).
Another way is to run a script `src/SSD300_inference.py`. It contains the logic from the notebook, wrapped into a Python script. The script contains sample usage.
Another way is to run the script `examples/SSD300_inference.py`. It contains the logic from the notebook wrapped into a Python script, together with sample usage.
To use the inference example script in your own code, you can call the `main` function, providing input image URIs as an argument. The result will be a list of detections for each input image.
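As a rough illustration (the exact signature is documented by the sample usage inside `examples/SSD300_inference.py`; the import assumes the script's directory is on `PYTHONPATH`, and the image URL is only a placeholder):

```python
from SSD300_inference import main

# one list of detections is returned per input image URI
detections = main(['http://example.com/street_scene.jpg'])
print(detections[0])
```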
@ -597,16 +597,18 @@ The following sections provide details on how we achieved our performance and ac
##### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `./examples/SSD300_FP{16,32}_{1,4,8}GPU.sh`
script in the `pytorch-19.06-py3` NGC container on NVIDIA DGX-1 with 8x
script in the `pytorch-19.08-py3` NGC container on NVIDIA DGX-1 with 8x
V100 16G GPUs. Performance numbers (in items/images per second) were averaged
over an entire training epoch.
| **Number of GPUs** | **Mixed precision mAP** | **Training time with mixed precision** | **FP32 mAP** | **Training time with FP32** |
|:------------------:|:------------------------:|:-------------------------------------:|:------------:|:---------------------------:|
| 1 | 0.2494 | 10h 39min | 0.2483 | 21h 40min |
| 4 | 0.2495 | 2h 53min | 0.2478 | 5h 52min |
| 8 | 0.2489 | 1h 31min | 0.2475 | 2h 54min |
|GPUs |Batch size / GPU|Accuracy - FP32|Accuracy - mixed precision|Time to train - FP32|Time to train - mixed precision|Time to train speedup (FP32 to mixed precision)|
|-----------|----------------|---------------|---------------------------|--------------------|--------------------------------|------------------------------------------------|
|1 |32 |0.250 |0.250 |20:20:13 |10:23:46 |195.62% |
|4 |32 |0.249 |0.250 |5:11:17 |2:39:28 |195.20% |
|8 |32 |0.250 |0.250 |2:37:35 |1:25:38 |184.01% |
|1 |64 |<N/A> |0.252 |<N/A> |9:27:33 |215.00% |
|4 |64 |<N/A> |0.251 |<N/A> |2:24:43 |215.10% |
|8 |64 |<N/A> |0.252 |<N/A> |1:13:01 |215.85% |
Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
@ -620,15 +622,18 @@ Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
##### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `main.py` script with the `--mode
benchmark-training` flag in the `pytorch-19.06-py3` NGC container on NVIDIA
benchmark-training` flag in the `pytorch-19.08-py3` NGC container on NVIDIA
DGX-1 with 8x V100 16G GPUs. Performance numbers (in items/images per second)
were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision img/s (median)** | **FP32 img/s (median)** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with mixed precision** | **Multi-gpu weak scaling with FP32** |
|:------------------:|:----------------------:|:----------------------------------:|:-----------------------:|:---------------------------------:|:-----------------------------------------------:|:------------------------------------:|
| 1 | 32 | 217.052 | 102.495 | 2.12 | 1.00 | 1.00 |
| 4 | 32 | 838.457 | 397.797 | 2.11 | 3.86 | 3.88 |
| 8 | 32 | 1639.843 | 789.695 | 2.08 | 7.56 | 7.70 |
|GPUs |Batch size / GPU|Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision |
|-----------|----------------|-----------------|-----------------------------|-------------------------------------------|--------------------------------|------------------------------------------------|
|1 |32 |133.67 |215.30 |161.07% |100.00% |100.00% |
|4 |32 |532.05 |828.63 |155.74% |398.04% |384.88% |
|8 |32 |1,060.33 |1,647.74 |155.40% |793.27% |765.33% |
|1 |64 |<N/A> |232.22 |173.73% |<N/A> |100.00% |
|4 |64 |<N/A> |910.77 |171.18% |<N/A> |392.20% |
|8 |64 |<N/A> |1,769.48 |166.88% |<N/A> |761.99% |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
@ -638,16 +643,16 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
##### NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running the `main.py` script with `--mode
benchmark-inference` flag in the pytorch-19.06-py3 NGC container on NVIDIA
benchmark-inference` flag in the pytorch-19.08-py3 NGC container on NVIDIA
DGX-1 with (1x V100 16G) GPUs.
| **Batch size** | **Mixed precision img/s (median)** | **FP32 img/s (median)** |
|:--------------:|:----------------------------------:|:-----------------------:|
| 2 | 163.12 | 147.91 |
| 4 | 296.60 | 201.62 |
| 8 | 412.52 | 228.16 |
| 16 | 470.10 | 280.57 |
| 32 | 520.54 | 302.43 |
|Batch size |Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision |
|-----------|-----------------|-----------------------------|-------------------------------------------|--------------------|--------------------------------|
|2 |148.99 |186.60 |125.24% |100.00% |100.00% |
|4 |203.35 |326.69 |160.66% |136.48% |175.08% |
|8 |227.32 |433.45 |190.68% |152.57% |232.29% |
|16 |278.02 |493.19 |177.39% |186.60% |264.31% |
|32 |299.81 |545.84 |182.06% |201.23% |292.53% |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
@ -655,6 +660,13 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
### Changelog
August 2019
* upgrade the PyTorch container to 19.08
* update Results section in the README
* code updated to use DALI 0.12.0
* checkpoint loading fix
* fixed links in the README
July 2019
* script and notebook for inference
* use AMP instead of hand-crafted FP16 support
@ -666,7 +678,7 @@ July 2019
March 2019
* Initial release
### Known issues
## Known issues
There are no known issues with this model.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import skimage

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import numpy as np
@ -10,7 +24,6 @@ from src.utils import dboxes300_coco, Encoder
def load_checkpoint(model, model_file):
cp = torch.load(model_file)['model']
cp = { k.replace('module.1.', ''): cp[k] for k in cp }
model.load_state_dict(cp)

File diff suppressed because one or more lines are too long

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from argparse import ArgumentParser
@ -157,13 +171,12 @@ def train(train_loop_func, logger, args):
if args.checkpoint is not None:
if os.path.isfile(args.checkpoint):
load_checkpoint(ssd300, args.checkpoint)
load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
checkpoint = torch.load(args.checkpoint,
map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
start_epoch = checkpoint['epoch']
iteration = checkpoint['iteration']
scheduler.load_state_dict(checkpoint['scheduler'])
ssd300.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
else:
print('Provided checkpoint is not path to a file')

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

PyTorch/Detection/SSD/src/coco.py Executable file → Normal file
View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'tylin'
__version__ = '2.0'
# Interface for accessing the Microsoft COCO dataset.

View file

@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -43,7 +43,7 @@ class COCOPipeline(Pipeline):
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,
shard_id = shard_id, num_shards = num_gpus, ratio=True, ltrb=True, random_shuffle=True,
skip_empty=True)
self.decode = ops.HostDecoder(device = "cpu", output_type = types.RGB)
self.decode = ops.ImageDecoder(device = "cpu", output_type = types.RGB)
# Augumentation techniques
self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1)
@ -163,7 +163,7 @@ class DALICOCOIterator(object):
for p in self._pipes:
p._prefetch()
for p in self._pipes:
outputs.append(p._share_outputs())
outputs.append(p.share_outputs())
for i in range(self._num_gpus):
dev_id = self._pipes[i].device_id
out_images = []
@ -237,8 +237,8 @@ class DALICOCOIterator(object):
pyt_offsets[j] = torch.IntTensor(bbox_offsets[j])
for p in self._pipes:
p._release_outputs()
p._run()
p.release_outputs()
p.schedule_run()
copy_db_index = self._current_data_batch
# Change index for double buffering

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
@ -18,7 +32,7 @@ def get_train_loader(args, local_seed):
output_fp16=args.amp, output_nhwc=False,
pad_output=False, seed=local_seed)
train_pipe.build()
test_run = train_pipe.run()
test_run = train_pipe.schedule_run(), train_pipe.share_outputs(), train_pipe.release_outputs()
train_loader = DALICOCOIterator(train_pipe, 118287 / args.N_gpu)
return train_loader

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import time
import numpy as np

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Variable
import torch
import time

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torchvision.transforms as transforms
import torch.utils.data as data

View file

@ -1,8 +1,20 @@
data/download/
data/extracted/
data/formatted_one_article_per_line/
data/sharded/
data/hdf5/
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
data/download
data/extracted
data/formatted_one_article_per_line
data/sharded
data/hdf5
vocab/
results/
checkpoints/*
results/

View file

@ -8,14 +8,11 @@ __pycache__/
# C extensions
*.so
#Data
#Data checkpoints and results
data/*/*/
data/*/*.zip
data/*
#checkpoints and results
checkpoints/*
results/*
checkpoints/
results/
# Distribution / packaging
.Python

View file

@ -1,24 +1,22 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.07-py3
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.08-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
WORKDIR /opt
RUN rm -rf /opt/pytorch/apex ; \
git clone https://github.com/NVIDIA/apex.git pytorch/apex ; \
cd pytorch/apex ; \
pip uninstall --yes apex; \
git checkout 880ab925bce9f817a93988b021e12db5f67f7787; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/334/head:multi_tensor_lamb_optimizer \
# && git checkout multi_tensor_lamb_optimizer \
# && python setup.py develop --cuda_ext --cpp_ext
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git

View file

@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@ -176,6 +175,8 @@
END OF TERMS AND CONDITIONS
Copyright 2019 NVIDIA CORPORATION. All rights reserved.
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following

View file

@ -1,8 +1,8 @@
# BERT For PyTorch
This repository provides a script and recipe to train the BERT model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
This repository provides a script and recipe to train the BERT model for PyTorch to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
**Table Of Contents**
## Table Of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
@ -11,6 +11,7 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Glossary](#glossary)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
@ -18,14 +19,12 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Pre-training parameters](#pre-training-parameters)
* [Multi-node](#multi-node)
* [Fine-tuning parameters](#fine-tuning-parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Relocating hdf5 files](#relocating-hdf5-files)
* [Inter sequence-pair mixing](#inter-sequence-pair-mixing)
* [Retaining document-level granularity](#retaining-document-level-granularity)
* [Training process](#training-process)
* [Pre-training](#pre-training)
* [Fine-tuning](#fine-tuning)
@ -43,31 +42,34 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Training stability test](#training-stability-test)
* [Pre-training stability test](#pre-training-stability-test)
* [Fine-tuning stability test](#fine-tuning-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
* [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
* [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
* [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
* [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
* [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
* [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
* [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
* [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
* [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
* [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
* [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
* [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
* [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
* [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
* [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
* [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for faster training times while maintaining target accuracy.
@ -75,22 +77,25 @@ BERT, or Bidirectional Encoder Representations from Transformers, is a new metho
The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pre-training and fine-tuning for tasks such as question answering. The major differences between the original implementation of the paper and this version of BERT are as follows:
- Scripts to download Wikipedia and BookCorpus datasets
- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion.
- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion
- Fused [LAMB](https://arxiv.org/pdf/1904.00962.pdf) optimizer to support training with larger batches
- Fused Adam optimizer for fine tuning tasks
- Fused CUDA kernels for better LayerNorm performance
- Automatic Mixed precision training support
- Automatic mixed precision (AMP) training support
- Scripts to launch on multiple nodes
Other publicly available implementations of BERT include:
1. [Google's official implementation](https://github.com/google-research/bert)
2. [codertimo](https://github.com/codertimo/BERT-pytorch)
1. [NVIDIA Tensorflow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)
2. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
3. [codertimo](https://github.com/codertimo/BERT-pytorch)
4. [gluon-nlp](https://github.com/dmlc/gluon-nlp/tree/master/scripts/bert)
5. [Google's implementation](https://github.com/google-research/bert)
This model trains with mixed precision using Tensor Cores on Volta GPUs and provides a push-button solution to pretraining on a corpus of choice. As a result, researchers can get results 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
The BERT architecture uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, a positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
The BERT model uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, each consisting of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
An illustration of the architecture taken from the [Transformer paper](https://arxiv.org/pdf/1706.03762.pdf) is shown below.
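As a complement to the illustration, the block structure described above can be sketched in PyTorch as follows; this is a rough, illustrative example (not the code in `model.py`), with dimensions following the BERT-base configuration:

```
# Illustrative sketch of one Transformer encoder block as described above.
# This is not the repository's model.py implementation.
import torch
import torch.nn as nn

class EncoderBlock(nn.Module):
    def __init__(self, hidden=768, heads=12, ff=3072, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden, heads, dropout=dropout)
        self.ff = nn.Sequential(nn.Linear(hidden, ff), nn.GELU(), nn.Linear(ff, hidden))
        self.norm1 = nn.LayerNorm(hidden)
        self.norm2 = nn.LayerNorm(hidden)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Multi-head self-attention sublayer with residual connection and layer norm
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + self.drop(attn_out))
        # Feed-forward sublayer with residual connection and layer norm
        return self.norm2(x + self.drop(self.ff(x)))

x = torch.randn(128, 2, 768)    # (sequence, batch, hidden) token embeddings
print(EncoderBlock()(x).shape)  # torch.Size([128, 2, 768])
```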
@ -100,14 +105,14 @@ An illustration of the architecture taken from the [Transformer paper](https://a
The architecture of the BERT model is almost identical to the Transformer model that was first introduced in the [Attention Is All You Need paper](https://arxiv.org/pdf/1706.03762.pdf). The main innovation of BERT lies in the pre-training step, where the model is trained on two unsupervised prediction tasks using a large text corpus. Training on these unsupervised tasks produces a generic language model, which can then be quickly fine-tuned to achieve state-of-the-art performance on language processing tasks such as question answering.
The BERT paper reports results two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.
The BERT paper reports the results for two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.
| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|
|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
Additionally, this implementation supports training on multiple GPUs. Mixed precision training and inference with dynamic loss scaling is also supported.
### Feature support matrix
@ -118,12 +123,13 @@ The following features are supported by this model.
|APEX AMP|Yes|
|APEX DDP|Yes|
|LAMB|Yes|
|Multi-node|Yes|
#### Features
[APEX](https://github.com/NVIDIA/apex) is a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training, whereas [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training, where as [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training.
[LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer and is a large batch optimization technique that helps accelerate training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512 respectively, compared to a batch size of 256 for Adam. The optimized implementation accumulates 1024 gradient batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in a 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs, resulting in training speedups of up to 72x in comparison to [Adam](https://arxiv.org/pdf/1412.6980.pdf). Adam has limitations on the learning rate that can be used since it is applied globally on all parameters, whereas LAMB follows a layerwise learning rate strategy.
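The gradient-accumulation idea behind these large effective batches can be sketched as follows. This is a toy, self-contained example rather than the repository's training loop; the stand-in model, optimizer, and loader replace BERT, the fused LAMB optimizer, and the hdf5 data loaders:

```
# Toy illustration of gradient accumulation. The real scripts accumulate 1024
# (phase 1) or 4096 (phase 2) micro-batches before each LAMB weight update.
import torch

model = torch.nn.Linear(10, 2)                            # stand-in for BERT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # stand-in for LAMB
loss_fn = torch.nn.CrossEntropyLoss()
loader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)]

accumulation_steps = 4  # stand-in for 1024 / 4096

optimizer.zero_grad()
for step, (inputs, labels) in enumerate(loader):
    loss = loss_fn(model(inputs), labels)
    (loss / accumulation_steps).backward()  # gradients sum across micro-batches
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                    # one update per accumulated large batch
        optimizer.zero_grad()
```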
@ -135,10 +141,9 @@ Mixed precision is the combined use of different numerical precisions in a compu
2. Adding loss scaling to preserve small gradient values.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
- APEX tools for mixed precision training, see the [NVIDIA APEX: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
@ -149,15 +154,35 @@ Automatic mixed precision can be enabled with the following code changes:
```
from apex import amp
if fp16:
# Wrap optimizer and model
model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
# Wrap optimizer and model
model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
if fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
Where `<opt_level>` is the optimization level. In the pretraining, “O2” is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to the pre-training and fine-tuning Python scripts. Shell scripts all have a positional argument available to enable mixed precision training.
Where `<opt_level>` is the optimization level. For pretraining, `O2` is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to `run_pretraining.py` and `run_squad.py`. All shell scripts have a positional argument available to enable mixed precision training.
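Put together, a self-contained sketch of where these two snippets sit in a training step might look as follows. The toy linear model and SGD optimizer are placeholders; the actual scripts wrap BERT and the LAMB/Adam optimizers in the same way, and `O2` mirrors the pretraining setting:

```
# Minimal AMP sketch mirroring the snippets above; requires APEX and a GPU.
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()            # placeholder for BERT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

fp16 = True
if fp16:
    # Wrap optimizer and model
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")

inputs = torch.randn(8, 10).cuda()
labels = torch.randint(0, 2, (8,)).cuda()
loss = torch.nn.functional.cross_entropy(model(inputs), labels)

if fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # backward pass on the dynamically scaled loss
else:
    loss.backward()
optimizer.step()
```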
### Glossary
**Fine-tuning**
Training an already pretrained model further using a task-specific dataset for subject-specific refinements, by adding task-specific layers on top if required.
**Language Model**
Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
**Pre-training**
Training a model on vast amounts of data on the same (or different) task to build general understanding.
**Transformer**
The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
**Phase1**
Pretraining on samples of sequence length 128 and 20 masked predictions per sequence.
**Phase2**
Pretraining on samples of sequence length 512 and 80 masked predictions per sequence.
## Setup
@ -178,9 +203,14 @@ For more information about how to get started with NGC containers, see the follo
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see [Advanced](#advanced).
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
@ -190,11 +220,11 @@ To train your model using mixed precision with Tensor Cores or using FP32, perfo
`cd DeepLearningExamples/PyTorch/LanguageModeling/BERT`
2. Download NVIDIA pretrained checkpoint.
2. Download the NVIDIA pretrained checkpoint.
If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Make sure to place the downloaded checkpoint in `checkpoints/` folder.
If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Ensure you place the downloaded checkpoint in the `checkpoints/` folder.
3. Build the BERT 19.07 NGC container.
3. Build the BERT 19.08 NGC container.
`bash scripts/docker/build.sh`
@ -202,7 +232,7 @@ If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/c
`bash scripts/docker/launch.sh`
Resultant logs and checkpoints of pretraining and finetuning routines get stored in the `results/` folder.
Resultant logs and checkpoints of pretraining and fine-tuning routines get stored in the `results/` folder.
`data` and `vocab.txt` are downloaded into the `data/` directory by default. Refer to the [Getting the data](#getting-the-data) section for more details on how to process a custom corpus as required for BERT pretraining.
@ -214,25 +244,29 @@ This repository provides scripts to download, verify and extract the following d
- Wikipedia (pre-training)
- BookCorpus (pre-training)
To download, verify, extract the datasets, and create the shards in hdf5 format, run:
To download, verify, extract the datasets, and create the shards in hdf5 format, run:
`/workspace/bert/data/create_datasets_from_start.sh`
6. Start pre-training.
Depending on the speed of your internet connection, this process takes about a day to complete.
BERT is designed to pre-train deep bidirectional representations for language representations. The following scripts are to replicate pretraining on Wikipedia+Book Corpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
6. Start pretraining.
From within the container, you can use the following script to run pre-training.
BERT is designed to pre-train deep bidirectional networks for language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
To run on a single node, from within the container, you can use the following script to run pre-training.
`bash scripts/run_pretraining.sh`
More details can be found in Details/Training Process
7. Start fine-tuning with the SQUAD dataset.
The default hyperparameters are set to run on 8 x V100 32G cards.
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuaD dataset.
To run on multiple nodes, see the [Multi-node](#multi-node) section.
7. Start fine-tuning with the SQuAD dataset.
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
`bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`
Default arguments are listed below in order,
Default arguments are listed below in the order the script expects:
- Initial checkpoint - The default is `/workspace/checkpoints/bert_uncased.pt`.
- Number of training Epochs - The default is `2`.
@ -244,18 +278,18 @@ Default arguments are listed below in order,
- SQuAD directory - The default is `/workspace/bert/data/v1.1`.
- Vocabulary file (token to ID mapping) - The default is `/workspace/bert/vocab/vocab`.
- Output directory for result - The default is `/results/SQuAD`.
- Mode (“train”, “eval”, “train eval”, "predict") - The default is `train`.
- Config file for the bert model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
- Mode (`train`, `eval`, `train eval`, `predict`) - The default is `train`.
- Config file for the BERT model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
The script will save the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
The script saves the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
9. Start validation/evaluation.
Validation can be performed with the same script as above, setting `Mode` to "prediction".
Validation can be performed with the same script as above, setting `Mode` to `prediction`.
10. Start inference/predictions.
Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions get saved to `<OUTPUT_DIRECTORY>/predictions.json`.
Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
## Advanced
@ -273,7 +307,7 @@ Descriptions of the key scripts and folders are provided below.
- `create_pretraining_data.py` - Creates `.hdf5` files from shared text files in the final step of dataset creation.
- `model.py` - Implements the BERT pre-training and fine-tuning model architectures with PyTorch.
- `optimization.py` - Implements the LAMB optimizer with PyTorch.
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuaD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
- `run_pretraining.py` - Implements BERT pre-training.
- `run_pretraining_inference.py` - Implements evaluation of a BERT pre-trained model.
@ -284,145 +318,169 @@ Descriptions of the key scripts and folders are provided below.
The complete list of the available parameters for the `run_pretraining.py` script are:
```
--input_dir INPUT_DIR - The input data directory.
Should contain .hdf5 files for the task.
--input_dir INPUT_DIR - The input data directory.
Should contain .hdf5 files for the task.
--config_file CONFIG_FILE - Path to a json file describing the BERT model
configuration. This file configures the model
architecture, such as the number of transformer
blocks, number of attention heads, etc.
--config_file CONFIG_FILE - Path to a json file describing the BERT model
configuration. This file configures the model
architecture, such as the number of transformer
blocks, number of attention heads, etc.
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--output_dir OUTPUT_DIR - Path to the output directory where the model
checkpoints will be written.
--output_dir OUTPUT_DIR - Path to the output directory where the model
checkpoints will be written.
--max_seq_length MAX_SEQ_LENGTH
- The maximum total input sequence length after
WordPiece tokenization. Sequences longer than
this will be truncated, and sequences shorter
than this will be padded.
- The maximum total input sequence length after
WordPiece tokenization. Sequences longer than
this will be truncated, and sequences shorter
than this will be padded.
--max_predictions_per_seq MAX_PREDICTIONS_PER_SEQ
- The maximum total of masked tokens per input
sequence for Masked LM.
- The maximum total of masked tokens per input
sequence for Masked LM.
--train_batch_size TRAIN_BATCH_SIZE
- Batch size per GPU for training.
- Batch size per GPU for training.
--learning_rate LEARNING_RATE
- The initial learning rate for LAMB optimizer.
- The initial learning rate for LAMB optimizer.
--max_steps MAX_STEPS - Total number of training steps to perform.
--max_steps MAX_STEPS - Total number of training steps to perform.
--warmup_proportion WARMUP_PROPORTION
- Proportion of training to perform linear learning
rate warmup for. For example, 0.1 = 10% of training.
- Proportion of training to perform linear learning
rate warmup for. For example, 0.1 = 10% of training.
--seed SEED - Sets the seed to use for random number generation.
--seed SEED - Sets the seed to use for random number generation.
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
- Number of update steps to accumulate before
performing a backward/update pass.
- Number of update steps to accumulate before
performing a backward/update pass.
--fp16 - If set, will perform computations using
automatic mixed precision.
--fp16 - If set, will perform computations using
automatic mixed precision.
--loss_scale LOSS_SCALE - Sets the loss scaling value to use when
mixed precision is used. The default value (0)
tells the script to use dynamic loss scaling
instead of fixed loss scaling.
--loss_scale LOSS_SCALE - Sets the loss scaling value to use when
mixed precision is used. The default value (0)
tells the script to use dynamic loss scaling
instead of fixed loss scaling.
--log_freq LOG_FREQ - If set, the script will output the training
loss every LOG_FREQ steps.
--log_freq LOG_FREQ - If set, the script will output the training
loss every LOG_FREQ steps.
--resume_from_checkpoint - If set, training will resume from a checkpoint
that currently exists in OUTPUT_DIR.
--resume_from_checkpoint - If set, training will resume from a checkpoint
that currently exists in OUTPUT_DIR.
--num_steps_per_checkpoint NUM_STEPS_PER_CHECKPOINT
- Number of update steps until a model checkpoint
is saved to disk.`
- Number of update steps until a model checkpoint
is saved to disk.
--phase2 - Specified if training on phase 2 only. If not specified, default pretraining is on phase 1.
--phase1_end_step - The number of steps phase 1 was trained for. In order to
resume phase 2 the correct way, phase1_end_step should correspond to the --max_steps phase 1 was trained for.
```
#### Multi-node
Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script with the following command for a 4-node DGX1 example for both phase 1 and phase 2:
```
BATCHSIZE=2048 LR=6e-3 GRADIENT_STEPS=128 PHASE=1 sbatch -N4 --ntasks-per-node=8 run.sub
BATCHSIZE=1024 LR=4e-3 GRADIENT_STEPS=256 PHASE=2 sbatch -N4 --ntasks-per-node=8 run.sub
```
The checkpoint after phase 1 will be saved in the `checkpointdir` specified in `run.sub`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
Variables to re-run the [Training performance results](#training-performance-results) are available in the `configurations.yml` file.
The batch variables `BATCHSIZE`, `LR`, `GRADIENT_STEPS`, and `PHASE` refer to the Python arguments `train_batch_size`, `learning_rate`, `gradient_accumulation_steps`, and `phase2` respectively.
Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase.
Refer to the file's contents to see the full list of variables to adjust for your system.
#### Fine-tuning parameters
The run_squad.py script contains many of the same arguments as `run_pretraining.py`.
The `run_squad.py` script contains many of the same arguments as `run_pretraining.py`.
The main script specific parameters are:
```
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--train_file TRAIN_FILE - Path to the SQuAD json for training.
For example, train-v1.1.json.
--train_file TRAIN_FILE - Path to the SQuAD json for training.
For example, train-v1.1.json.
--predict_file PREDICT_FILE - Path to the SQuAD json for predictions.
For example, dev-v1.1.json or test-v1.1.json.
--predict_file PREDICT_FILE - Path to the SQuAD json for predictions.
For example, dev-v1.1.json or test-v1.1.json.
--max_seq_length MAX_SEQ_LENGTH
- The maximum total input sequence length
after WordPiece tokenization.
Sequences longer than this will be truncated,
and sequences shorter than this will be padded.
- The maximum total input sequence length
after WordPiece tokenization.
Sequences longer than this will be truncated,
and sequences shorter than this will be padded.
--doc_stride DOC_STRIDE - When splitting up a long document into chunks
this parameters sets how much stride to take
between chunks of tokens.
--doc_stride DOC_STRIDE - When splitting up a long document into chunks
this parameters sets how much stride to take
between chunks of tokens.
--max_query_length MAX_QUERY_LENGTH
- The maximum number of tokens for the question.
Questions longer than <max_query_length>
will be truncated to the value specified.
- The maximum number of tokens for the question.
Questions longer than <max_query_length>
will be truncated to the value specified.
--n_best_size N_BEST_SIZE - The total number of n-best predictions to
generate in the nbest_predictions.json
output file.
--n_best_size N_BEST_SIZE - The total number of n-best predictions to
generate in the nbest_predictions.json
output file.
--max_answer_length MAX_ANSWER_LENGTH
- The maximum length of an answer that can be
generated. This is needed because the start and
end predictions are not conditioned on one another.
- The maximum length of an answer that can be
generated. This is needed because the start and
end predictions are not conditioned on one another.
--verbose_logging - If true, all the warnings related to data
processing will be printed. A number of warnings
are expected for a normal SQuAD evaluation.
--verbose_logging - If true, all the warnings related to data
processing will be printed. A number of warnings
are expected for a normal SQuAD evaluation.
--do_lower_case - Whether to lower case the input text. Set to
true for uncased models and false for cased models.
--do_lower_case - Whether to lower case the input text. Set to
true for uncased models and false for cased models.
--version_2_with_negative - If true, the SQuAD examples contain questions
that do not have an answer.
--version_2_with_negative - If true, the SQuAD examples contain questions
that do not have an answer.
--null_score_diff_threshold NULL_SCORE_DIFF_THRESHOLD
- A null answer will be predicted if null_score if
best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
- A null answer will be predicted if null_score -
best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
```
### Command-line options
To see the full list of available options and their descriptions, use the -h or --help command line option, for example:
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
`python run_pretraining.py --help`
`python run_squad.py --help`
Detailed descriptions of command line options can be found in the Parameters section above.
Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
### Getting the data
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as Book Corpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
The preparation of the pre-training dataset is described in the `bertPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows:
@ -436,12 +494,11 @@ The preparation of pre-training dataset is described in the `bertPrep.py` script
5. hdf5 file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding hdf5 file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
The tools used for preparing the Bookcorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
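To make the expected input concrete, here is a toy illustration of the document-separated format and the sharding idea. It is not the repository's code; the assumption that blank lines separate documents and the naive sentence split are only for illustration:

```
# Conceptual illustration of "document-separated corpus -> sentence-segmented
# shards"; bertPrep.py / create_datasets_from_start.sh implement the real flow.
corpus = """First document. It has two sentences.

Second document, a single sentence."""

# Assumption for this sketch: blank lines separate documents.
documents = [d.strip() for d in corpus.split("\n\n") if d.strip()]

# Naive sentence split as a stand-in for proper sentence segmentation.
segmented = [[s.strip() + "." for s in doc.split(".") if s.strip()] for doc in documents]

# Distribute documents round-robin into a fixed number of text shards.
num_shards = 2
shards = [segmented[i::num_shards] for i in range(num_shards)]
for i, shard in enumerate(shards):
    print(f"shard {i}: {len(shard)} document(s)")
```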
For fine-tuning a pre-trained BERT model for specific tasks, by default this repository prepares the following dataset:
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
#### Dataset guidelines
@ -469,7 +526,7 @@ The training process consists of two steps: pre-training and fine-tuning.
Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and BookCorpus datasets as training data using LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using Wikipedia and BookCorpus datasets as training data using the LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
Phase 1: (Maximum sequence length of 128)
- Runs on 8 GPUs with training batch size of 64 per GPU
@ -487,7 +544,7 @@ Phase 2: (Maximum sequence length of 512)
- Saves a checkpoint every 200 iterations (keeps only the latest 3 checkpoints) and at the end of training. All checkpoints, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Creates a log file containing all the output
These parameters will train on Wikipedia and BooksCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
These parameters will train on Wikipedia and BookCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
`bash run_pretraining.sh <training_batch_size> <learning-rate> <precision> <num_gpus> <warmup_proportion> <training_steps> <save_checkpoint_steps> <resume_training> <create_logfile> <accumulate_gradients> <gradient_accumulation_steps> <seed> <job_name> <allreduce_post_accumulation> <allreduce_post_accumulation_fp16> <accumulate_into_fp16> <train_batch_size_phase2> <learning_rate_phase2> <warmup_proportion_phase2> <train_steps_phase2> <gradient_accumulation_steps_phase2>`
@ -496,20 +553,23 @@ Where:
- `<training_batch_size>` is per-GPU batch size used for training. Larger batch sizes run more efficiently, but require more memory.
- `<learning_rate>` is the base learning rate for training
- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
- FP32: 32-bit IEEE single precision floats.
- FP16: Mixed precision 16 and 32 bit floats.
- FP32: 32-bit IEEE single precision floats.
- FP16: Mixed precision 16 and 32 bit floats.
- `<num_gpus>` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
- `<warmup_proportion>` is the percentage of training steps used for warm-up at the start of training.
- `<training_steps>` is the total number of training steps.
- `<save_checkpoint_steps>` controls how often checkpoints are saved.
- `<resume_training>` if set to true, training should resume from latest model in /results/checkpoints. Default is false.
- `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are true or false. true indicates output should be saved to a log file.)
- `<resume_training>` if set to `true`, training should resume from latest model in `/results/checkpoints`. Default is `false`.
- `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a log file.)
- `<accumulate_gradient>` a flag indicating whether a larger batch should be simulated with gradient accumulation.
- `<gradient_accumulation_steps>` an integer indicating the number of steps to accumulate gradients over. Effective batch size = `training_batch_size` / `gradient_accumulation_steps`.
- `<seed>` random seed for the run.
- `<allreduce_post_accumulation>` - If set to `true`, performs allreduce only after the defined number of gradient accumulation steps.
- `<allreduce_post_accumulation_fp16>` - If set to `true`, performs allreduce after gradient accumulation steps in FP16.
- `<accumulate_into_fp16>` - If set to `true`, accumulates/sums the gradients in FP16.
Note: The above three options need to be set to false when running on fp32.
- `<training_batch_size_phase2>` is per-GPU batch size used for training in phase 2. Larger batch sizes run more efficiently, but require more memory.
- `<learning_rate_phase2>` is the base learning rate for training phase 2.
- `<warmup_proportion_phase2>` is the percentage of training steps used for warm-up at the start of phase 2 training. An illustrative invocation follows this parameter list.
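To make the argument order concrete, here is a minimal sketch of a DGX-1 16G invocation. The numeric values mirror defaults that appear elsewhere in this document (batch sizes, learning rates, warm-up proportions, step counts, seed, checkpoint interval); the job name is a made-up placeholder, and the whole line should be read as an illustration of the positional order rather than a recommended configuration.

```bash
# Illustrative only: positional arguments follow the usage line above.
# Phase 1: batch 8192, LR 6e-3, accumulation 512; Phase 2: batch 4096, LR 4e-3, accumulation 1024.
bash scripts/run_pretraining.sh 8192 6e-3 fp16 8 0.2843 7038 200 false true true 512 42 \
    bert_lamb_pretraining true true true 4096 4e-3 0.128 1563 1024
```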
@ -522,7 +582,7 @@ For example:
Trains BERT-large from scratch on a DGX-1 32G using FP16 arithmetic. 90% of the training steps are done with sequence length 128 (phase1 of training) and 10% of the training steps are done with sequence length 512 (phase2 of training).
In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`
In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`.
In order to train on a DGX-2 32G, set `train_batch_size` to `4096`, `train_batch_size_phase2` to `2048`, `num_gpus` to `16`, `gradient_accumulation_steps` to `64` and `gradient_accumulation_steps_phase2` to `256` in `scripts/run_pretraining.sh`, for example as sketched below.
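A minimal sketch of those DGX-2 32G overrides, assuming they are applied to the variable defaults near the top of `scripts/run_pretraining.sh` (the variable names are the ones listed above; only the changed values are shown):

```bash
# DGX-2 32G (16x V100 32G) overrides named in the paragraph above.
train_batch_size=4096
train_batch_size_phase2=2048
num_gpus=16
gradient_accumulation_steps=64
gradient_accumulation_steps_phase2=256
```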
@ -538,17 +598,17 @@ By default, each Python script implements fine-tuning a pre-trained BERT model f
- Has FP16 precision enabled
- Saves a checkpoint at the end of training to the `/results/<dataset_name>` folder
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIAs [Apex](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, consult the [Parameters](#parameters) section.
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA's [APEX](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
All fine-tuning shell scripts have the same positional arguments, outlined below:
`bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQUAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>`
```bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQuAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>```
By default, the `mode` positional argument is set to `train eval`. See the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
Note: The first positional argument (the path to the checkpoint to load) is required.
Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory or separate path can be a command line input to `run_squad.sh`.
Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory, or a separate path can be provided as a command-line input to `run_squad.sh`. A sample training invocation is sketched below.
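To make the positional order concrete, a hedged sample training invocation is sketched below. Every path is a placeholder, and the epoch count and learning rate are illustrative values rather than defaults taken from this repository; the per-GPU batch size of 4 and the 8-GPU/FP16 setting mirror the fine-tuning configuration described later in this guide.

```bash
# Positional order: <checkpoint> <epochs> <batch/GPU> <LR> <precision> <num_gpus> <seed>
#                   <SQuAD dir> <vocab file> <output dir> <mode> <config file>
# All paths below are placeholders; substitute your own locations.
CHECKPOINT=/results/checkpoints/ckpt_7038.pt      # pre-trained checkpoint to load (assumed name)
SQUAD_DIR=/workspace/bert/data/squad/v1.1         # SQuAD dataset directory (placeholder)
VOCAB_FILE=/workspace/bert/vocab/vocab.txt        # vocabulary file (placeholder)
OUT_DIR=/results/SQuAD                            # output directory
CONFIG=/workspace/bert/bert_config.json           # BERT config file
bash scripts/run_squad.sh $CHECKPOINT 2 4 3e-5 fp16 8 42 $SQUAD_DIR $VOCAB_FILE $OUT_DIR train $CONFIG
```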
### Inference process
@ -578,13 +638,13 @@ Where:
- `<evaluation_batch_size>` is per-GPU batch size used for inference. Larger batch sizes run more efficiently, but require more memory.
- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
- `fp32`: 32-bit IEEE single precision floats
- `fp16`: 16-bit floats for 3.2x faster inference
- `fp32`: 32-bit IEEE single precision floats
- `fp16`: 16-bit floats for 3.2x faster inference
- `<num_gpus>` is the number of GPUs to use for inference. Must be equal to or smaller than the number of GPUs attached to your node.
- `<inference_mode>` is either `--eval` for evaluation or `--prediction` for inference
- `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the checkpoints folder.
- `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the `checkpoints` folder.
- `<inference_steps>` is the total number of inference steps per process. Default is `-1`, which iterates over the entire dataset.
- `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are true or false. true indicates output should be saved to a logfile.)
- `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a logfile.)
For example:
@ -598,11 +658,10 @@ Evaluation fine-tuning is enabled by the same scripts as training:
The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned BERT model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting `mode` variable in the script to either `eval` or `prediction` flag, you can choose between running evaluation on a given dataset or doing prediction.
Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction`, you can choose between running prediction and evaluating the predictions on a given dataset, or running prediction only.
`bash scripts/run_squad.sh <path to fine-tuned model checkpoint>`
Note: Fine-tuning evaluation is only supported on a single GPU, as reflected in the sketch below.
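A corresponding evaluation-only sketch, again with placeholder paths and illustrative hyperparameters, sets the mode argument to `eval` and uses one GPU:

```bash
# Evaluation-only sketch: same positional order as training, mode "eval", single GPU.
CHECKPOINT=/results/SQuAD/pytorch_model.bin       # fine-tuned checkpoint (assumed name)
bash scripts/run_squad.sh $CHECKPOINT 2 4 3e-5 fp16 1 42 \
    /workspace/bert/data/squad/v1.1 /workspace/bert/vocab/vocab.txt /results/SQuAD eval \
    /workspace/bert/bert_config.json
```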
## Performance
@ -612,11 +671,11 @@ The following section shows how to run benchmarks measuring the model performanc
#### Training performance benchmark
Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Training process](#training-process).
Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Training process](#training-process).
To benchmark the training performance on a specific batch size, run:
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> train <BERT config path] <max steps>`
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> train <BERT config path> <max steps>`
An example call used to generate throughput numbers:
@ -626,11 +685,11 @@ An example call used to generate throughput numbers:
#### Inference performance benchmark
Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Inference process](#inference-process).
Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Inference process](#inference-process).
To benchmark the inference performance on a specific batch size, run:
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
An example call used to generate throughput numbers:
@ -644,18 +703,20 @@ An example call used to generate throughput numbers:
#### Training accuracy results
Our results were obtained by running `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
Note: Pretraining results were obtained with a dataset that was created using an earlier version of the data preprocessing scripts than are currently in this repository, and with an earlier snapshot of wikidumps. The results in the table will be updated soon with results using the latest data prep scripts. Early data show the results are quite similar.
Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
##### Pre-training loss results
| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(days) - FP32 | Time to train(days) - mixed precision | Time to train speedup (FP32 to mixed precision)
| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
|---|---|---|---|---|---|---|---|---
| NVIDIA DGX-1 With 16G|8|8192 and 4196 |512 and 1024|-|1.53|-|6.84|-
| NVIDIA DGX-2 With 32G|16|4096 and 2048 |64 and 256|-|1.52|-|2.71|-
| 1 x NVIDIA DGX-1 With 16G|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
| 1 x NVIDIA DGX-2H With 32G|16|4096 and 2048 |64 and 256|-|1.35|-|58.4|-
| 4 x NVIDIA DGX-1 With 16G|8|2048 and 1024 |128 and 256|-|1.34|-|39.27|-
| 4 x NVIDIA DGX-2H With 32G|16|1024 and 512 |16 and 64|-|1.33|-|15.35|-
| 16 x NVIDIA DGX-1 With 16G|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
| 16 x NVIDIA DGX-2H With 32G|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
| 64 x NVIDIA DGX-2H With 32G|16|64 and 32 |(1 and 4)FP16 and (2 and 8)FP32|1.33|1.331|4.338|1.124|3.85
##### Fine-tuning accuracy results
@ -667,9 +728,9 @@ Note: Pretraining results were obtained with a dataset that was created using an
###### Pre-training stability test
| Accuracy Metric | Seed 1
|---|---
| Final Loss | 1.52
| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|---|---|---|---|---|---|---|---
|Final Loss| 1.344 | 1.328 | 1.324 | 1.326 | 1.333 | 1.331 | 0.009
###### Fine-tuning stability test
@ -680,11 +741,12 @@ Training stability with 8 GPUs, FP16 computations, batch size of 4:
|Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | 0.200
| f1 % | 91.29 | 91.01 | 91.14 | 91.10 | 90.85 | 91.08 | 0.162
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.shtraining scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
###### Pre-training NVIDIA DGX-1 With 16G
@ -698,6 +760,18 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
| 8| 2| 4| 512| 56.16 |194.56 | 3.46| 7.43| 7.30
###### Pre-training on multiple NVIDIA DGX-1 With 16G
| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
|1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
|4 |8 | N/A | 16| 128| N/A |3089.76 | N/A| N/A| 3.53
|16 |8 | N/A | 16| 128| N/A |12144.64 | N/A| N/A| 13.89
|1 |8 | N/A | 4| 512| N/A |195.93 |N/A |N/A | 1.00
|4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
|16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02
###### Fine-tuning NVIDIA DGX-1 With 16G
@ -713,7 +787,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
##### Training performance: NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Pre-training NVIDIA DGX-1 With 32G
@ -729,6 +803,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
|4 |N/A | 10| 512|N/A |164.00 | N/A| N/A| 3.57
| 8|N/A | 10| 512|N/A |325.60| N/A| N/A| 7.08
###### Fine-tuning NVIDIA DGX-1 With 32G
| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@ -743,7 +818,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Pre-training NVIDIA DGX-2 With 32G
@ -762,6 +837,22 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
|8 | N/A | 10| 512| N/A| 325.60| N/A| N/A| 6.87
|16 | N/A | 10| 512| N/A| 648.00| N/A| N/A| 13.67
###### Pre-training on multiple NVIDIA DGX-2H With 32G
Note: Multi-node performance numbers below are on DGX-2H whereas the single node performance numbers above are on DGX-2.
| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
|1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
|4 |16 | N/A | 64| 128| N/A |12709.88 | N/A| N/A| 3.76
|16 |16 | N/A | 64| 128| N/A |51937.28 | N/A| N/A| 15.37
|64 |16 | 32 | 64| 128| 46628.86 |188088.32 | 4.03 | N/A| 55.66
|1 |16 | N/A | 8| 512| N/A |625.66 |N/A |N/A | 1.00
|4 |16 | N/A | 8| 512| N/A |2386.38 | N/A| N/A| 3.81
|16| 16| N/A | 8| 512| N/A |9932.8 | N/A| N/A| 15.87
|64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
###### Fine-tuning NVIDIA DGX-2 With 32G
| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@ -781,7 +872,7 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` on data of sequence length 512 and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
###### Pre-training inference on NVIDIA DGX-1 with 16G
@ -797,7 +888,7 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` on d
##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
###### Pre-training inference on NVIDIA DGX-1 with 32G
@ -813,13 +904,13 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` and
##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
###### Pre-training inference on NVIDIA DGX-2 with 32G
|GPUs | Throughput - FP32(sequences/sec)|Throughput - Mixed Precision(sequences/sec)
|---------- |---------|---------------
| 1| 30.24 97.72
| 1| 30.24| 97.72
###### Fine-tuning inference on NVIDIA DGX-2 with 32G
@ -835,16 +926,20 @@ The inference performance metrics used were items/second.
### Changelog
September 2019
- Scripts to support multi-node launch
- Update pretraining loss results based on the latest data preparation scripts
August 2019
- Pretraining support with LAMB optimizer
- Pre-training support with LAMB optimizer
- Updated Data download and Preprocessing
July 2019
- Initial release
### Known issues
There are no known issues with this model.

View file

@ -1,3 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import subprocess
import os

View file

@ -0,0 +1,182 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#1 DGX1 phase1
bert--DGX1:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "8192"
LR: "6e-3"
GRADIENT_STEPS: "512"
PHASE: "1"
#4 DGX1 phase1
bert--DGX1_4x8x16x128:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "2048"
LR: "6e-3"
GRADIENT_STEPS: "128"
PHASE: "1"
#16 DGX1 phase1
bert--DGX1_16x8x16x32:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "512"
LR: "6e-3"
GRADIENT_STEPS: "32"
PHASE: "1"
#1 DGX2 phase1
bert--DGX2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "4096"
LR: "6e-3"
GRADIENT_STEPS: "64"
PHASE: "1"
#4 DGX2 phase1
bert--DGX2_4x16x64x16:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "1024"
LR: "6e-3"
GRADIENT_STEPS: "16"
PHASE: "1"
#16 DGX2 phase1
bert--DGX2_16x16x64x4:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "256"
LR: "6e-3"
GRADIENT_STEPS: "4"
PHASE: "1"
#64 DGX2 phase1
bert--DGX2_64x16x64:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "64"
BATCHSIZE: "64"
LR: "6e-3"
GRADIENT_STEPS: "1"
PHASE: "1"
#1 DGX1 phase2
bert--DGX1_1x8x4x1024:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "4096"
LR: "4e-3"
GRADIENT_STEPS: "1024"
PHASE: "2"
#4 DGX1 phase2
bert--DGX1_4x8x4x256:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "1024"
LR: "4e-3"
GRADIENT_STEPS: "256"
PHASE: "2"
#16 DGX1 phase2
bert--DGX1_16x8x4x64:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "256"
LR: "4e-3"
GRADIENT_STEPS: "64"
PHASE: "2"
#1 DGX2 phase2
bert--DGX2_1x16x8x256:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "2048"
LR: "4e-3"
GRADIENT_STEPS: "256"
PHASE: "2"
#4 DGX2 phase2
bert--DGX2_4x16x8x64:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "512"
LR: "4e-3"
GRADIENT_STEPS: "64"
PHASE: "2"
#16 DGX2 phase2
bert--DGX2_16x16x8x16:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "128"
LR: "4e-3"
GRADIENT_STEPS: "16"
PHASE: "2"
#64 DGX2 phase2
bert--DGX2_64x16x8x4:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "64"
BATCHSIZE: "32"
LR: "4e-3"
GRADIENT_STEPS: "4"
PHASE: "2"

View file

@ -1,6 +1,6 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
class BooksDownloader:

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from itertools import islice

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
@ -43,6 +54,4 @@ class WikiDownloader:
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
else:
assert False, 'WikiDownloader not implemented for this language yet.'
assert False, 'WikiDownloader not implemented for this language yet.'

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os

View file

@ -0,0 +1,12 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import BookscorpusTextFormatting
import Downloader
@ -70,14 +81,13 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh not fully supported at this time. The simplified/traditional Chinese data still needs to be translated and properly segmented, and should work once this step is added.'
if args.skip_wikiextractor == 0:
@ -85,6 +95,7 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download
python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
@ -26,4 +39,4 @@ python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset bo
# Create HDF5 files Phase 2
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 \
--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1

View file

@ -1,5 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading MRPC data"
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py

View file

@ -1,5 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading dataset for squad..."
# Download SQuAD

View file

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import

View file

@ -1,8 +1,22 @@
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
import math
@ -24,6 +25,7 @@ from torch.nn.utils import clip_grad_norm_
from apex.optimizers import FusedAdam
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
multi_tensor_l2norm = amp_C.multi_tensor_l2norm
lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda

View file

@ -0,0 +1,74 @@
#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eux
# The following variables need to be set
# Base container to be used
readonly docker_image="nvcr.io/nvidia/pytorch:19.08-py3"
# Location of dataset for phase 1
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/checkpoints"
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LR:-6e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.2843} \
--input_dir=/workspace/data \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=7038 \
--num_steps_per_checkpoint=2500 \
"
PHASE2="\
--train_batch_size=${BATCHSIZE:-4096} \
--learning_rate=${LR:-4e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.128} \
--input_dir=/workspace/data_phase2 \
--phase2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=1563 \
--num_steps_per_checkpoint=1000 \
--resume_from_checkpoint --phase1_end_step=7038 \
"
PHASES=( "$PHASE1" "$PHASE2" )
PHASE=${PHASE:-1}
BERT_CMD="\
python -u /workspace/bert/run_pretraining.py \
--seed=42 \
${PHASES[$((PHASE-1))]} \
--do_train \
--config_file=/workspace/bert/bert_config.json \
--output_dir=/results \
--fp16 \
--allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
--gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
--log_freq=1 \
--local_rank=\${SLURM_LOCALID}"
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

View file

@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
@ -65,7 +66,6 @@ def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
pin_memory=True)
# shared_list["0"] = (train_dataloader, input_file)
return train_dataloader, input_file
class pretraining_dataset(Dataset):
@ -179,7 +179,7 @@ def parse_arguments():
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=50.0,
type=float, default=1.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
@ -253,7 +253,7 @@ def setup_training(args):
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not args.resume_from_checkpoint:
@ -478,8 +478,7 @@ def main():
for f_id in range(f_start_id + 1 , len(files)):
# torch.cuda.synchronize()
# f_start = time.time()
if torch.distributed.get_world_size() > num_files:
data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_id)%num_files]
else:
@ -489,23 +488,10 @@ def main():
previous_file = data_file
# train_dataloader = shared_file_list["0"][0]
# thread = multiprocessing.Process(
# name="LOAD DATA:" + str(f_id) + ":" + str(data_file),
# target=create_pretraining_dataset,
# args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu)
# )
# thread.start()
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args)
# torch.cuda.synchronize()
# f_end = time.time()
# print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start))
train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# torch.cuda.synchronize()
# iter_start = time.time()
training_steps += 1
batch = [t.to(device) for t in batch]
@ -533,7 +519,7 @@ def main():
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.max_steps:
last_num_steps = global_step % args.log_freq
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
@ -541,7 +527,7 @@ def main():
average_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(average_loss)
if is_main_process():
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, average_loss.item()))
logger.info("Total Steps:{} Final Loss = {}".format(training_steps / args.gradient_accumulation_steps, average_loss.item()))
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / (
@ -578,13 +564,6 @@ def main():
# thread.join()
return args
# torch.cuda.synchronize()
# iter_end = time.time()
# if torch.distributed.get_rank() == 0:
# print('step {} : {}'.format(global_step, iter_end - iter_start))
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader

View file

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
@ -40,6 +40,7 @@ from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
from utils import is_main_process
if sys.version_info[0] == 2:
import cPickle as pickle
@ -923,9 +924,11 @@ def main():
model = BertForQuestionAnswering(config)
# model = BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
print("USING CHECKOINT")
if is_main_process():
print("LOADING CHECKOINT")
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
print("USED CHECKPOINT \n\n")
if is_main_process():
print("LOADED CHECKPOINT")
model.to(device)
if args.fp16 and args.old:
model.half()

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse

View file

@ -1,3 +1,17 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from torch.optim.optimizer import Optimizer

View file

@ -1,4 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DATA_DIR=${1:-/workspace/bert/data}
# Download vocab files from pretrained model

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MRPC_DIR=/workspace/bert/data/glue/MRPC
OUT_DIR=/results/MRPC
@ -55,7 +68,8 @@ CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
@ -18,11 +31,11 @@ allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
accumulate_into_fp16=${16:-"false"}
train_batch_size_phase2=${1:-4096}
learning_rate_phase2=${2:-"4e-3"}
warmup_proportion_phase2=${5:-"0.128"}
train_steps_phase2=${6:-1563}
gradient_accumulation_steps_phase2=${11:-512}
train_batch_size_phase2=${17:-4096}
learning_rate_phase2=${18:-"4e-3"}
warmup_proportion_phase2=${19:-"0.128"}
train_steps_phase2=${20:-1563}
gradient_accumulation_steps_phase2=${21:-512}
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/
@ -108,13 +121,7 @@ CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
@ -145,7 +152,7 @@ throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}'
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size')}')
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size' / '$gradient_accumulation_steps' )}')
echo " training throughput phase1: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
@ -207,13 +214,7 @@ CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
@ -239,7 +240,8 @@ throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}'
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2')}')
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2' / '$gradient_accumulation_steps_phase2')}')
echo " training throughput phase2: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets

View file

@ -1,7 +1,19 @@
#!/usr/bin/env bash
#OUT_DIR=/results/SQuAD
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#OUT_DIR=/results/SQuAD
echo "Container nvidia build = " $NVIDIA_BUILD_ID

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
@ -54,7 +67,8 @@ CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

View file

@ -1,4 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# purpose: for multinode training on slurm clusters
node_type=${1:-"dgx1"}
num_nodes=${2:-1}

View file

@ -1,6 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -1,3 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist

View file

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.06-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && \

Some files were not shown because too many files have changed in this diff.