Merge pull request #5 from NVIDIA/master

Update 20.01
commit 3ed70df5e3
Swetha Mandava, 2020-02-06 16:51:47 -08:00, committed by GitHub
GPG key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
427 changed files with 30733 additions and 5219 deletions

.github/ISSUE_TEMPLATE/bug_report.md (vendored, new file, 29 lines)

@ -0,0 +1,29 @@
---
name: Bug report
about: Create a report to help us improve
title: "[Model/Framework] What is the problem?"
labels: bug
assignees: ''
---
Related to **Model/Framework(s)**
*(e.g. GNMT/PyTorch or FasterTransformer/All)*
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Install '...'
2. Set "..."
3. Launch '...'
**Expected behavior**
A clear and concise description of what you expected to happen.
**Environment**
Please provide at least:
* Container version (e.g. pytorch:19.05-py3):
* GPUs in the system (e.g. 8x Tesla V100-SXM2-16GB):
* CUDA driver version (e.g. 418.67):


@ -0,0 +1,28 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Model/Framework or something else] Feature requested"
labels: enhancement
assignees: ''
---
Related to **Model/Framework(s) or something else (describe)**
*Examples:*
* *GNMT/PyTorch*
* *AMP*
* *Tensorflow 2.0*
* *Jupyter notebooks*
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

.gitmodules (vendored, 12 lines changed)

@ -1,7 +1,7 @@
 [submodule "PyTorch/Translation/Transformer/cutlass"]
-	path = PyTorch/Translation/Transformer/cutlass
-	url = https://github.com/NVIDIA/cutlass.git
-[submodule "TensorFlow/LanguageModeling/BERT/tensorrt-inference-server"]
-	path = TensorFlow/LanguageModeling/BERT/tensorrt-inference-server
-	url = https://github.com/NVIDIA/tensorrt-inference-server.git
-	branch = r19.06
+	path = PyTorch/Translation/Transformer/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
+[submodule "PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server"]
+	path = PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
+	url = https://github.com/NVIDIA/tensorrt-inference-server.git
+	branch = r19.06


@ -0,0 +1,3 @@
.git/
data/
kaldi/

Kaldi/SpeechRecognition/.gitignore (vendored, new file, 4 lines)

@ -0,0 +1,4 @@
data/*
!data/README.md
.*.swp
.*.swo


@ -0,0 +1,55 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
ENV DEBIAN_FRONTEND=noninteractive
ARG PYVER=3.6
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
RUN mkdir /opt/trtis-kaldi && mkdir -p /workspace/model-repo/kaldi_online/1 && mkdir -p /mnt/model-repo
# Copying static files
COPY scripts /workspace/scripts
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
# Building the custom backend
COPY trtis-kaldi-backend /workspace/trtis-kaldi-backend
#COPY --from=cbe /workspace/install/custom-backend-sdk /workspace/trtis-kaldi-backend/custom-backend-sdk
RUN cd /workspace/trtis-kaldi-backend && wget https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.9.0/v1.9.0_ubuntu1804.custombackend.tar.gz -O custom-backend-sdk.tar.gz && tar -xzf custom-backend-sdk.tar.gz
RUN cd /workspace/trtis-kaldi-backend/ && make && cp libkaldi-trtisbackend.so /workspace/model-repo/kaldi_online/1/ && cd - && rm -r /workspace/trtis-kaldi-backend
COPY scripts/nvidia_kaldi_trtis_entrypoint.sh /opt/trtis-kaldi
ENTRYPOINT ["/opt/trtis-kaldi/nvidia_kaldi_trtis_entrypoint.sh"]


@ -0,0 +1,41 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3-clientsdk
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
COPY kaldi-asr-client /workspace/src/clients/c++/kaldi-asr-client
RUN echo "add_subdirectory(kaldi-asr-client)" >> "/workspace/src/clients/c++/CMakeLists.txt"
RUN cd /workspace/build/ && make -j16 trtis-clients


@ -0,0 +1,203 @@
Except where otherwise noted, the following license applies to all files in this repo.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,241 @@
# Kaldi ASR Integration With TensorRT Inference Server
This repository provides a Kaldi ASR custom backend for the NVIDIA TensorRT Inference Server (TRTIS). It can be used to demonstrate high-performance online inference on Kaldi ASR models. This includes handling the gRPC communication between the TensorRT Inference Server and clients, and the dynamic batching of inference requests. This repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Table Of Contents](#table-of-contents)
- [Solution overview](#solution-overview)
* [Reference model](#reference-model)
* [Default configuration](#default-configuration)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Parameters](#parameters)
* [Model path](#model-path)
* [Model configuration](#model-configuration)
* [Inference engine configuration](#inference-engine-configuration)
* [Inference process](#inference-process)
* [Client command-line parameters](#client-command-line-parameters)
* [Input/Output](#inputoutput)
* [Input](#input)
* [Output](#output)
* [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)
- [Performance](#performance)
* [Metrics](#metrics)
* [Results](#results)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Solution overview
This repository provides a wrapper around the online GPU-accelerated ASR pipeline from the paper [GPU-Accelerated Viterbi Exact Lattice Decoder for Batched Online and Offline Speech Recognition](https://arxiv.org/abs/1910.10032). That work includes a high-performance implementation of a GPU HMM Decoder, a low-latency Neural Net driver, fast Feature Extraction for preprocessing, and new ASR pipelines tailored for GPUs. These different modules have been integrated into the Kaldi ASR framework.
This repository contains a TensorRT Inference Server custom backend for the Kaldi ASR framework. This custom backend calls the high-performance online GPU pipeline from the Kaldi ASR framework. This TensorRT Inference Server integration provides ease-of-use to Kaldi ASR inference: gRPC streaming server, dynamic sequence batching, and multi-instance support. A client connects to the gRPC server, streams audio by sending chunks to the server, and gets back the inferred text as an answer (see [Input/Output](#inputoutput)). More information about the TensorRT Inference Server can be found [here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/).
This TensorRT Inference Server integration is meant to be used with the LibriSpeech model for demonstration purposes. We include a pre-trained version of this model to allow you to easily test this work (see [Quick Start Guide](#quick-start-guide)). Both the TensorRT Inference Server integration and the underlying Kaldi ASR online GPU pipeline are a work in progress and will support more functionality in the future. For example, online iVectors are not yet supported in the Kaldi ASR GPU online pipeline and are currently replaced by a zero vector (see [Known issues](#known-issues)). Support for a custom Kaldi ASR model is experimental (see [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)).
### Reference model
A reference model is used by all test scripts and benchmarks presented in this repository to illustrate this solution. We are using the Kaldi ASR `LibriSpeech` recipe, available [here](https://github.com/kaldi-asr/kaldi/blob/master/egs/librispeech/s5). It was trained by NVIDIA and is delivered as a pre-trained model.
### Default configuration
Details about parameters can be found in the [Parameters](#parameters) section.
* `model path`: Configured to use the pretrained LibriSpeech model.
* `beam`: 10
* `lattice_beam`: 7
* `max_active`: 10,000
* `frame_subsampling_factor`: 3
* `acoustic_scale`: 1.0
* `num_worker_threads`: 20
* `max_execution_batch_size`: 256
* `max_batch_size`: 4096
* `instance_group.count`: 2
## Setup
### Requirements
This repository contains Dockerfiles which extend the Kaldi and TensorRT Inference Server NVIDIA GPU Cloud (NGC) containers and encapsulate some dependencies. Aside from these dependencies, ensure you have [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) installed.
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
## Quick Start Guide
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/Kaldi/SpeechRecognition
```
2. Build the client and server containers.
`scripts/docker/build.sh`
3. Download and set up the pre-trained model and eval dataset.
`scripts/docker/launch_download.sh`
The model and dataset are downloaded in the `data/` folder.
4. Start the server.
`scripts/docker/launch_server.sh`
Once you see the line `Starting Metrics Service at 0.0.0.0:8002`, the server is ready to be used. You can then start the client.
Currently, multi-GPU is not supported. By default GPU 0 is used. You can use a specific GPU by using `NVIDIA_VISIBLE_DEVICES`:
`NVIDIA_VISIBLE_DEVICES=<GPUID> scripts/docker/launch_server.sh`
5. Start the client.
The following command streams 1000 parallel audio channels to the server. The `-p` option prints the inferred `TEXT` sent back from the server.
`scripts/docker/launch_client.sh -p`
## Advanced
### Parameters
The configuration is done through the `config.pbtxt` file available in the `model-repo/kaldi_online/` directory. It allows you to specify the following:
#### Model path
The following parameters can be modified if you want to use your own Kaldi model.
* `mfcc_filename`
* `ivector_filename`
* `nnet3_rxfilename`
* `fst_rxfilename`
* `word_syms_rxfilename`
#### Model configuration
The model configuration parameters are passed to the model and have an impact on both accuracy and performance. They are standard Kaldi ASR parameters, so you can reuse the values currently used in your CPU-based Kaldi ASR pipeline.
* `beam`
* `lattice_beam`
* `max_active`
* `frame_subsampling_factor`
* `acoustic_scale`
#### Inference engine configuration
The inference engine configuration parameters configure the inference engine. They impact performance, but not accuracy.
* `max_batch_size`: The maximum number of inference channels opened at a given time. If set to `4096`, then one instance will handle at most 4096 concurrent users.
* `num_worker_threads`: The number of CPU threads for the postprocessing CPU tasks, such as lattice determinization and text generation from the lattice.
* `max_execution_batch_size`: The size of one execution batch on the GPU. Set it just large enough to saturate the GPU: larger batches increase throughput, smaller batches reduce latency.
* `input.WAV_DATA.dims`: The maximum number of samples per chunk. The value must be a multiple of `frame_subsampling_factor * chunks_per_frame`.
### Inference process
Inference is done by simulating concurrent users. Each user is assigned one utterance from the LibriSpeech dataset, streams that utterance to the server in chunks, and receives the final `TEXT` output once the last chunk has been sent. A parameter sets the number of active users simulated in parallel.
### Client command-line parameters
The client can be configured through a set of parameters that define its behavior. To see the full list of available options and their descriptions, use the `-h` command-line option. The parameters are:
```
-v
-i <Number of iterations on the dataset>
-c <Number of parallel audio channels>
-a <Path to the scp dataset file>
-l <Maximum number of samples per chunk. Must correspond to the server config>
-u <URL for inference service and its gRPC port>
-o : Only feed each channel at realtime speed. Simulates online clients.
-p : Print text outputs
```
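For example, `scripts/docker/launch_client.sh -p -o -i 5 -c 800` runs five iterations over the dataset with 800 parallel realtime channels and prints the inferred text (the values here are illustrative; `scripts/run_inference_all_v100.sh` uses these flags for the benchmark runs).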
### Input/Output
The API is currently experimental.
#### Input
The server expects chunks of audio, each containing up to `input.WAV_DATA.dims` samples. By default, this corresponds to 510ms of audio per chunk. The last chunk of an utterance may contain fewer samples than this maximum value.
Each chunk is sent as a float array in the input `WAV_DATA`, with the input `WAV_DATA_DIM` containing the number of samples in that chunk. Flags can be set to mark a chunk as the first or last chunk of a sequence. Finally, each chunk from a given sequence is associated with a `CorrelationID`; every chunk belonging to the same sequence must carry the same `CorrelationID`.
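The sketch below shows how a client could stream a single utterance with the `TRTISASRClient` class added in this commit (`asr_client_imp.h`). The function, audio buffer, and correlation ID are illustrative; the chunk size must match the server's `input.WAV_DATA.dims` (8160 samples by default). See `kaldi_asr_parallel_client.cc` for the full multi-channel client.
```
#include <algorithm>
#include <cstdint>
#include <vector>
#include "asr_client_imp.h"

void StreamOneUtterance(std::vector<float>& audio, uint64_t corr_id) {
  const size_t chunk_length = 8160;  // must match input.WAV_DATA.dims
  TRTISASRClient client("localhost:8001", "kaldi_online",
                        /*ncontextes=*/1, /*print_results=*/true);
  size_t offset = 0;
  while (offset < audio.size()) {
    size_t nsamples = std::min(chunk_length, audio.size() - offset);
    bool is_first_chunk = (offset == 0);
    bool is_last_chunk = (offset + nsamples >= audio.size());
    // Every chunk of a sequence carries the same CorrelationID; the first and
    // last chunks set the START and END flags respectively.
    client.SendChunk(corr_id, is_first_chunk, is_last_chunk,
                     audio.data() + offset,
                     static_cast<int>(nsamples * sizeof(float)));
    offset += nsamples;
  }
  client.WaitForCallbacks();  // blocks until the TEXT output has been received
}
```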
#### Output
Once the server receives the final chunk of a sequence (with the `END` flag set), it generates the output associated with that sequence and sends it back to the client. The end-of-sequence procedure is:
1. Process the last chunk.
2. Flush and process the Neural Net context.
3. Generate the full lattice for the sequence.
4. Determinize the lattice.
5. Find the best path in the lattice.
6. Generate the text output for that best path.
7. Send the text back to the client.
Even if only the best path is used, we are still generating a full lattice for benchmarking purposes. Partial results (generated after each timestep) are currently not available but will be added in a future release.
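On the client side, the transcript is read from the `TEXT` output in the completion callback of the last chunk (see `TRTISASRClient::SendChunk` in `asr_client_imp.cc`).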
### Using a custom Kaldi ASR model
Support for Kaldi ASR models that are different from the provided LibriSpeech model is experimental. However, it is possible to modify the [Model Path](#model-path) section of the config file `model-repo/kaldi_online/config.pbtxt` to set up your own model.
The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may end up with not enough memory on the GPU to store two different instances. If that's the case, you can set `count` to `1` in the `instance_group` section of the config file.
## Performance
### Metrics
Throughput is measured using the RTFX metric, defined as `RTFX = (number of seconds of audio inferred) / (compute time in seconds)`. It is the inverse of the RTF (Real Time Factor) metric, i.e. `RTFX = 1/RTF`. For example, inferring 3,600 seconds of audio in 2 seconds of compute time corresponds to an RTFX of 1800.
Latency is defined as the delay between the availability of the last chunk of audio and the reception of the inferred text. More precisely, it is measured as follows:
1. *Client:* Last audio chunk available
2. ***t0** <- Current time*
3. *Client:* Send last audio chunk
4. *Server:* Compute inference of last chunk
5. *Server:* Generate the raw lattice for the full utterance
6. *Server:* Determinize the raw lattice
7. *Server:* Generate the text output associated with the best path in the determinized lattice
8. *Client:* Receive text output
9. *Client:* Call callback with output
10. ***t1** <- Current time*
The latency is then `latency = t1 - t0`.
### Results
Our results were obtained by:
1. Building and starting the server as described in [Quick Start Guide](#quick-start-guide).
2. Running `scripts/run_inference_all_v100.sh` and `scripts/run_inference_all_t4.sh`
| GPU | Realtime I/O | Number of parallel audio channels | Throughput (RTFX) | Latency 90% (s) | Latency 95% (s) | Latency 99% (s) | Latency Avg (s) |
| ------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ |
| V100 | No | 2000 | 1769.8 | N/A | N/A | N/A | N/A |
| V100 | Yes | 1500 | 1220 | 0.424 | 0.473 | 0.758 | 0.345 |
| V100 | Yes | 1000 | 867.4 | 0.358 | 0.405 | 0.707 | 0.276 |
| V100 | Yes | 800 | 647.8 | 0.304 | 0.325 | 0.517 | 0.238 |
| T4 | No | 1000 | 906.7 | N/A | N/A | N/A| N/A |
| T4 | Yes | 700 | 629.6 | 0.629 | 0.782 | 1.01 | 0.463 |
| T4 | Yes | 400 | 373.7 | 0.417 | 0.441 | 0.690 | 0.349 |
## Release notes
### Changelog
January 2020
* Initial release
### Known issues
Only MFCC features are supported at this time. The reference model used in the benchmark scripts requires both MFCC and iVector features to deliver the best accuracy. Support for iVector features will be added in a future release.


@ -0,0 +1,65 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required (VERSION 3.5)
add_executable(kaldi_asr_parallel_client kaldi_asr_parallel_client.cc asr_client_imp.cc)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE request
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE protobuf::libprotobuf
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/src/
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/tools/openfst-1.6.7/include/
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-feat.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-util.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-matrix.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-base.so
)
install(
TARGETS kaldi_asr_parallel_client
RUNTIME DESTINATION bin
)


@ -0,0 +1,177 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "asr_client_imp.h"
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <map>
#include <numeric>
#define FAIL_IF_ERR(X, MSG) \
{ \
nic::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
void TRTISASRClient::CreateClientContext() {
contextes_.emplace_back();
ClientContext& client = contextes_.back();
FAIL_IF_ERR(nic::InferGrpcStreamContext::Create(
&client.trtis_context, /*corr_id*/ -1, url_, model_name_,
/*model_version*/ -1,
/*verbose*/ false),
"unable to create context");
}
void TRTISASRClient::SendChunk(ni::CorrelationID corr_id,
bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size) {
ClientContext* client = &contextes_[corr_id % ncontextes_];
nic::InferContext& context = *client->trtis_context;
if (start_of_sequence) n_in_flight_.fetch_add(1, std::memory_order_consume);
// Setting options
std::unique_ptr<nic::InferContext::Options> options;
FAIL_IF_ERR(nic::InferContext::Options::Create(&options),
"unable to create inference options");
options->SetBatchSize(1);
options->SetFlags(0);
options->SetCorrelationId(corr_id);
if (start_of_sequence)
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_START,
start_of_sequence);
if (end_of_sequence) {
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_END,
end_of_sequence);
for (const auto& output : context.Outputs()) {
options->AddRawResult(output);
}
}
FAIL_IF_ERR(context.SetRunOptions(*options), "unable to set context options");
std::shared_ptr<nic::InferContext::Input> in_wave_data, in_wave_data_dim;
FAIL_IF_ERR(context.GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
FAIL_IF_ERR(context.GetInput("WAV_DATA_DIM", &in_wave_data_dim),
"unable to get WAV_DATA_DIM");
// Wave data input
FAIL_IF_ERR(in_wave_data->Reset(), "unable to reset WAVE_DATA");
uint8_t* wave_data = reinterpret_cast<uint8_t*>(chunk);
if (chunk_byte_size < max_chunk_byte_size_) {
std::memcpy(&chunk_buf_[0], chunk, chunk_byte_size);
wave_data = &chunk_buf_[0];
}
FAIL_IF_ERR(in_wave_data->SetRaw(wave_data, max_chunk_byte_size_),
"unable to set data for WAVE_DATA");
// Dim
FAIL_IF_ERR(in_wave_data_dim->Reset(), "unable to reset WAVE_DATA_DIM");
int nsamples = chunk_byte_size / sizeof(float);
FAIL_IF_ERR(in_wave_data_dim->SetRaw(reinterpret_cast<uint8_t*>(&nsamples),
sizeof(int32_t)),
"unable to set data for WAVE_DATA_DIM");
total_audio_ += (static_cast<double>(nsamples) / 16000.); // TODO freq
double start = gettime_monotonic();
FAIL_IF_ERR(context.AsyncRun([corr_id, end_of_sequence, start, this](
nic::InferContext* ctx,
const std::shared_ptr<nic::InferContext::Request>& request) {
if (end_of_sequence) {
double elapsed = gettime_monotonic() - start;
std::string out;
std::map<std::string, std::unique_ptr<nic::InferContext::Result>> results;
ctx->GetAsyncRunResults(request, &results);
if (results.size() != 1) {
std::cerr << "Warning: Could not read output for corr_id " << corr_id
<< std::endl;
} else {
FAIL_IF_ERR(results["TEXT"]->GetRawAtCursor(0, &out),
"unable to get TEXT output");
if (print_results_) {
std::lock_guard<std::mutex> lk(stdout_m_);
std::cout << "CORR_ID " << corr_id << "\t\t" << out << std::endl;
}
{
std::lock_guard<std::mutex> lk(results_m_);
results_.insert({corr_id, {std::move(out), elapsed}});
}
}
n_in_flight_.fetch_sub(1, std::memory_order_relaxed);
}
}),
"unable to run model");
}
void TRTISASRClient::WaitForCallbacks() {
int n;
while ((n = n_in_flight_.load(std::memory_order_consume))) {
usleep(1000);
}
}
void TRTISASRClient::PrintStats() {
double now = gettime_monotonic();
double diff = now - started_at_;
double rtf = total_audio_ / diff;
std::cout << "Throughput:\t" << rtf << " RTFX" << std::endl;
std::vector<double> latencies;
{
std::lock_guard<std::mutex> lk(results_m_);
latencies.reserve(results_.size());
for (auto& result : results_) latencies.push_back(result.second.latency);
}
std::sort(latencies.begin(), latencies.end());
double nresultsf = static_cast<double>(latencies.size());
size_t per90i = static_cast<size_t>(std::floor(90. * nresultsf / 100.));
size_t per95i = static_cast<size_t>(std::floor(95. * nresultsf / 100.));
size_t per99i = static_cast<size_t>(std::floor(99. * nresultsf / 100.));
double lat_90 = latencies[per90i];
double lat_95 = latencies[per95i];
double lat_99 = latencies[per99i];
double avg = std::accumulate(latencies.begin(), latencies.end(), 0.0) /
latencies.size();
std::cout << std::setprecision(3);
std::cout << "Latencies:\t90\t\t95\t\t99\t\tAvg\n";
std::cout << "\t\t" << lat_90 << "\t\t" << lat_95 << "\t\t" << lat_99
<< "\t\t" << avg << std::endl;
}
TRTISASRClient::TRTISASRClient(const std::string& url,
const std::string& model_name,
const int ncontextes, bool print_results)
: url_(url),
model_name_(model_name),
ncontextes_(ncontextes),
print_results_(print_results) {
ncontextes_ = std::max(ncontextes_, 1);
for (int i = 0; i < ncontextes_; ++i) CreateClientContext();
std::shared_ptr<nic::InferContext::Input> in_wave_data;
FAIL_IF_ERR(contextes_[0].trtis_context->GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
max_chunk_byte_size_ = in_wave_data->ByteSize();
chunk_buf_.resize(max_chunk_byte_size_);
shape_ = {max_chunk_byte_size_};
n_in_flight_.store(0);
started_at_ = gettime_monotonic();
total_audio_ = 0;
}


@ -0,0 +1,73 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <atomic>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include "request_grpc.h"
#ifndef TRTIS_KALDI_ASR_CLIENT_H_
#define TRTIS_KALDI_ASR_CLIENT_H_
namespace ni = nvidia::inferenceserver;
namespace nic = nvidia::inferenceserver::client;
// time with arbitrary reference
double inline gettime_monotonic() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
double time = ts.tv_sec;
time += (double)(ts.tv_nsec) / 1e9;
return time;
}
class TRTISASRClient {
struct ClientContext {
std::unique_ptr<nic::InferContext> trtis_context;
};
std::string url_;
std::string model_name_;
std::vector<ClientContext> contextes_;
int ncontextes_;
std::vector<uint8_t> chunk_buf_;
std::vector<int64_t> shape_;
int max_chunk_byte_size_;
std::atomic<int> n_in_flight_;
double started_at_;
double total_audio_;
bool print_results_;
std::mutex stdout_m_;
struct Result {
std::string text;
double latency;
};
std::unordered_map<ni::CorrelationID, Result> results_;
std::mutex results_m_;
public:
void CreateClientContext();
void SendChunk(uint64_t corr_id, bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size);
void WaitForCallbacks();
void PrintStats();
TRTISASRClient(const std::string& url, const std::string& model_name,
const int ncontextes, bool print_results);
};
#endif // TRTIS_KALDI_ASR_CLIENT_H_


@ -0,0 +1,218 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <atomic>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "asr_client_imp.h"
#include "feat/wave-reader.h" // to read the wav.scp
#include "util/kaldi-table.h"
using kaldi::BaseFloat;
void Usage(char** argv, const std::string& msg = std::string()) {
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: scripts/docker/launch_client.sh [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-i <Number of iterations on the dataset>" << std::endl;
std::cerr << "\t-c <Number of parallel audio channels>" << std::endl;
std::cerr << "\t-a <Path to the scp dataset file>" << std::endl;
std::cerr << "\t-l <Maximum number of samples per chunk. Must correspond to "
"the server config>"
<< std::endl;
std::cerr << "\t-u <URL for inference service and its gRPC port>"
<< std::endl;
std::cerr << "\t-o : Only feed each channel at realtime speed. Simulates "
"online clients."
<< std::endl;
std::cerr << "\t-p : Print text outputs" << std::endl;
std::cerr << std::endl;
exit(1);
}
int main(int argc, char** argv) {
std::cout << "\n";
std::cout << "==================================================\n"
<< "============= TRTIS Kaldi ASR Client =============\n"
<< "==================================================\n"
<< std::endl;
// kaldi namespace TODO
using namespace kaldi;
typedef kaldi::int32 int32;
std::string url = "localhost:8001";
std::string model_name = "kaldi_online";
std::string wav_rspecifier =
"scp:/data/datasets/LibriSpeech/test_clean/wav_conv.scp";
int chunk_length = 8160;
size_t nchannels = 1000;
int niterations = 5;
bool verbose = false;
float samp_freq = 16000;
int ncontextes = 10;
bool online = false;
bool print_results = false;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "va:u:i:c:ophl:")) != -1) {
switch (opt) {
case 'i':
niterations = std::atoi(optarg);
break;
case 'c':
nchannels = std::atoi(optarg);
break;
case 'a':
wav_rspecifier = optarg;
break;
case 'u':
url = optarg;
break;
case 'v':
verbose = true;
break;
case 'o':
online = true;
break;
case 'p':
print_results = true;
break;
case 'l':
chunk_length = std::atoi(optarg);
break;
case 'h':
case '?':
Usage(argv);
break;
}
}
if (niterations <= 0) Usage(argv, "number of iterations must be > 0");
if (nchannels <= 0) Usage(argv, "number of audio channels must be > 0");
if (chunk_length <= 0) Usage(argv, "chunk length must be > 0");
std::cout << "Configuration:" << std::endl;
std::cout << std::endl;
std::cout << "Number of iterations\t\t: " << niterations << std::endl;
std::cout << "Number of parallel channels\t: " << nchannels << std::endl;
std::cout << "Server URL\t\t\t: " << url << std::endl;
std::cout << "Print results\t\t\t: " << (print_results ? "Yes" : "No")
<< std::endl;
std::cout << "Online - Realtime I/O\t\t: " << (online ? "Yes" : "No")
<< std::endl;
std::cout << std::endl;
float chunk_seconds = (double)chunk_length / samp_freq;
// need to read wav files
SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
std::atomic<uint64_t> correlation_id;
correlation_id.store(1); // 0 = no correlation
double total_audio = 0;
// pre-loading data
// we don't want to measure I/O
std::vector<std::shared_ptr<WaveData>> all_wav;
{
std::cout << "Loading eval dataset..." << std::flush;
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
std::shared_ptr<WaveData> wave_data = std::make_shared<WaveData>();
wave_data->Swap(&wav_reader.Value());
all_wav.push_back(wave_data);
total_audio += wave_data->Duration();
}
std::cout << "done" << std::endl;
}
struct Stream {
std::shared_ptr<WaveData> wav;
ni::CorrelationID corr_id;
int offset;
float send_next_chunk_at;
std::atomic<bool> received_output;
Stream(const std::shared_ptr<WaveData>& _wav, ni::CorrelationID _corr_id)
: wav(_wav), corr_id(_corr_id), offset(0), received_output(true) {
send_next_chunk_at = gettime_monotonic();
}
};
std::cout << "Opening GRPC contextes..." << std::flush;
TRTISASRClient asr_client(url, model_name, ncontextes, print_results);
std::cout << "done" << std::endl;
std::cout << "Streaming utterances..." << std::flush;
std::vector<std::unique_ptr<Stream>> curr_tasks, next_tasks;
curr_tasks.reserve(nchannels);
next_tasks.reserve(nchannels);
size_t all_wav_i = 0;
size_t all_wav_max = all_wav.size() * niterations;
while (true) {
while (curr_tasks.size() < nchannels && all_wav_i < all_wav_max) {
// Creating new tasks
uint64_t corr_id = correlation_id.fetch_add(1);
std::unique_ptr<Stream> ptr(new Stream(all_wav[all_wav_i%(all_wav.size())], corr_id));
curr_tasks.emplace_back(std::move(ptr));
++all_wav_i;
}
// If still empty, done
if (curr_tasks.empty()) break;
for (size_t itask = 0; itask < curr_tasks.size(); ++itask) {
Stream& task = *(curr_tasks[itask]);
SubVector<BaseFloat> data(task.wav->Data(), 0);
int32 samp_offset = task.offset;
int32 nsamp = data.Dim();
int32 samp_remaining = nsamp - samp_offset;
int32 num_samp =
chunk_length < samp_remaining ? chunk_length : samp_remaining;
bool is_last_chunk = (chunk_length >= samp_remaining);
SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
bool is_first_chunk = (samp_offset == 0);
if (online) {
double now = gettime_monotonic();
double wait_for = task.send_next_chunk_at - now;
if (wait_for > 0) usleep(wait_for * 1e6);
}
asr_client.SendChunk(task.corr_id, is_first_chunk, is_last_chunk,
wave_part.Data(), wave_part.SizeInBytes());
task.send_next_chunk_at += chunk_seconds;
if (verbose)
std::cout << "Sending correlation_id=" << task.corr_id
<< " chunk offset=" << num_samp << std::endl;
task.offset += num_samp;
if (!is_last_chunk) next_tasks.push_back(std::move(curr_tasks[itask]));
}
curr_tasks.swap(next_tasks);
next_tasks.clear();
// Showing activity if necessary
if (!print_results && !verbose) std::cout << "." << std::flush;
}
std::cout << "done" << std::endl;
std::cout << "Waiting for all results..." << std::flush;
asr_client.WaitForCallbacks();
std::cout << "done" << std::endl;
asr_client.PrintStats();
return 0;
}


@ -0,0 +1,149 @@
name: "kaldi_online"
platform: "custom"
default_model_filename: "libkaldi-trtisbackend.so"
max_batch_size: 2200
parameters: {
key: "mfcc_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/mfcc.conf"
}
}
parameters: {
key: "ivector_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/ivector_extractor.conf"
}
}
parameters: {
key: "nnet3_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/final.mdl"
}
}
parameters: {
key: "fst_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/HCLG.fst"
}
}
parameters: {
key: "word_syms_rxfilename"
value: {
string_value:"/data/models/LibriSpeech/words.txt"
}
}
parameters: [{
key: "beam"
value: {
string_value:"10"
}
},{
key: "num_worker_threads"
value: {
string_value:"40"
}
},
{
key: "max_execution_batch_size"
value: {
string_value:"512"
}
}]
parameters: {
key: "lattice_beam"
value: {
string_value:"7"
}
}
parameters: {
key: "max_active"
value: {
string_value:"10000"
}
}
parameters: {
key: "frame_subsampling_factor"
value: {
string_value:"3"
}
}
parameters: {
key: "acoustic_scale"
value: {
string_value:"1.0"
}
}
sequence_batching {
max_sequence_idle_microseconds:5000000
control_input [
{
name: "START"
control [
{
kind: CONTROL_SEQUENCE_START
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "READY"
control [
{
kind: CONTROL_SEQUENCE_READY
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "END"
control [
{
kind: CONTROL_SEQUENCE_END
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "CORRID"
control [
{
kind: CONTROL_SEQUENCE_CORRID
data_type: TYPE_UINT64
}
]
}
]
oldest {
max_candidate_sequences:2200
preferred_batch_size:[256,512]
max_queue_delay_microseconds:1000
}
},
input [
{
name: "WAV_DATA"
data_type: TYPE_FP32
dims: [ 8160 ]
},
{
name: "WAV_DATA_DIM"
data_type: TYPE_INT32
dims: [ 1 ]
}
]
output [
{
name: "TEXT"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
instance_group [
{
count: 2
kind: KIND_GPU
}
]


@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -f Dockerfile --rm -t trtis_kaldi_server
docker build . -f Dockerfile.client --rm -t trtis_kaldi_client


@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
/workspace/scripts/docker/prepare_data.sh
chown -R $1:$2 /data/
mv /data/* /mnt/data/


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker run --rm -it \
--net=host \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/data \
trtis_kaldi_client install/bin/kaldi_asr_parallel_client $@


@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Start TRTIS server container for download - need some kaldi tools
nvidia-docker run --rm \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/mnt/data \
trtis_kaldi_server /workspace/scripts/docker/dataset_setup.sh $(id -u) $(id -g)
# --user $(id -u):$(id -g) \


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"0"}
# Start TRTIS server
nvidia-docker run --rm -it \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-p8000:8000 \
-p8001:8001 \
-p8002:8002 \
--name trt_server_asr \
-e NVIDIA_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES \
-v $PWD/data:/data \
-v $PWD/model-repo:/mnt/model-repo \
trtis_kaldi_server trtserver --model-repo=/workspace/model-repo/


@ -0,0 +1,89 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
WORKSPACE=/data/
KALDI_ROOT=/opt/kaldi/
model=LibriSpeech
data=${1:-$WORKSPACE/data/}
datasets=$WORKSPACE/datasets/
models=$WORKSPACE/models/
# base url for downloads.
data_url=www.openslr.org/resources/12
lm_url=www.openslr.org/resources/11
mfccdir=mfcc
mkdir -p $data/$model
mkdir -p $models/$model
mkdir -p $datasets/$model
pushd $KALDI_ROOT/egs/librispeech/s5
. ./cmd.sh
. ./path.sh
. parse_options.sh
# you might not want to do this for interactive shells.
set -e
if [[ "$SKIP_DATA_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching dataset -----------
# download the evaluation data (test-clean and test-other).
for part in test-clean test-other; do
local/download_and_untar.sh $data $data_url $part
done
fi
# format the data as Kaldi data directories
echo ----------- Preprocessing dataset -----------
for part in test-clean test-other; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/$model/$part $datasets/$model/$(echo $part | sed s/-/_/g)
# convert the manifests
pushd $datasets/$model/$(echo $part | sed s/-/_/g)
#sed -i 's@workspace@'"${WORKSPACE}"'@' wav.scp
(cat wav.scp | awk '{print $1" "$6}' | sed 's/\.flac/\.wav/g' > wav_conv.scp)
popd
done
if [[ "$SKIP_FLAC2WAV" -ne "1" ]]; then
# Convert flac files to wavs
for flac in $(find $data/$model -name "*.flac"); do
wav=$(echo $flac | sed 's/flac/wav/g')
sox $flac -r 16000 -b 16 $wav
done
echo "Converted flac to wav."
fi
popd >&/dev/null
if [[ "$SKIP_MODEL_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching trained model -----------
pushd $models >&/dev/null
wget https://github.com/ryanleary/kaldi-test/releases/download/v0.0/LibriSpeech-trained.tgz -O LibriSpeech-trained.tgz
tar -xzf LibriSpeech-trained.tgz -C $model
cd $model/conf/
find . -name "*.conf" -exec sed -i 's@workspace@'"${WORKSPACE}"'@' {} \;
popd >&/dev/null
fi


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [ -d "/mnt/model-repo/kaldi_online" ]; then
ln -s /mnt/model-repo/kaldi_online/config.pbtxt /workspace/model-repo/kaldi_online/
fi
/opt/tensorrtserver/nvidia_entrypoint.sh $@


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 1000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 700 -o
scripts/docker/launch_client.sh -i 10 -c 400 -o


@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 2000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 1500 -o
scripts/docker/launch_client.sh -i 10 -c 1000 -o
scripts/docker/launch_client.sh -i 5 -c 800 -o


@ -0,0 +1,5 @@
.PHONY: all
all: kaldibackend
kaldibackend: kaldi-backend.cc kaldi-backend-utils.cc
g++ -fpic -shared -std=c++11 -o libkaldi-trtisbackend.so kaldi-backend.cc kaldi-backend-utils.cc -Icustom-backend-sdk/include custom-backend-sdk/lib/libcustombackend.a -I/opt/kaldi/src/ -I/usr/local/cuda/include -I/opt/kaldi/tools/openfst/include/ -L/opt/kaldi/src/lib/ -lkaldi-cudadecoder

View file

@ -0,0 +1,155 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out) {
input->clear(); // reset buffer
// The values for an input tensor are not necessarily in one
// contiguous chunk, so we might copy the chunks into 'input' vector.
// If possible, we use the data in place
uint64_t total_content_byte_size = 0;
while (true) {
const void* content;
uint64_t content_byte_size = expected_byte_size - total_content_byte_size;
if (!input_fn(input_context, name, &content, &content_byte_size)) {
return kInputContents;
}
// If 'content' returns nullptr we have all the input.
if (content == nullptr) break;
// If the total amount of content received exceeds what we expect
// then something is wrong.
total_content_byte_size += content_byte_size;
if (total_content_byte_size > expected_byte_size)
return kInputSize;
if (content_byte_size == expected_byte_size) {
*out = content;
return kSuccess;
}
input->insert(input->end(), static_cast<const uint8_t*>(content),
static_cast<const uint8_t*>(content) + content_byte_size);
}
// Make sure we end up with exactly the amount of input we expect.
if (total_content_byte_size != expected_byte_size) {
return kInputSize;
}
*out = input->data();
return kSuccess;
}
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str) {
kaldi::CompactLattice best_path_clat;
kaldi::CompactLatticeShortestPath(dlat, &best_path_clat);
kaldi::Lattice best_path_lat;
fst::ConvertLattice(best_path_clat, &best_path_lat);
std::vector<int32> alignment;
std::vector<int32> words;
kaldi::LatticeWeight weight;
fst::GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight);
std::ostringstream oss;
for (size_t i = 0; i < words.size(); i++) {
std::string s = word_syms.Find(words[i]);
if (s == "") std::cerr << "Word-id " << words[i] << " not in symbol table.";
oss << s << " ";
}
*out_str = std::move(oss.str());
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param) {
auto it = model_config_.parameters().find(key);
if (it == model_config_.parameters().end()) {
std::cerr << "Parameter \"" << key
<< "\" missing from config file. Exiting." << std::endl;
return kInvalidModelConfig;
}
*param = it->second.string_value();
return kSuccess;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stoi(tmp);
return err;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stof(tmp);
return err;
}
const char* CustomErrorString(int errcode) {
switch (errcode) {
case kSuccess:
return "success";
case kInvalidModelConfig:
return "invalid model configuration";
case kGpuNotSupported:
return "execution on GPU not supported";
case kSequenceBatcher:
return "model configuration must configure sequence batcher";
case kModelControl:
return "'START' and 'READY' must be configured as the control inputs";
case kInputOutput:
return "model must have four inputs and one output with shape [-1]";
case kInputName:
return "names for input don't exist";
case kOutputName:
return "model output must be named 'OUTPUT'";
case kInputOutputDataType:
return "model inputs or outputs data_type cannot be specified";
case kInputContents:
return "unable to get input tensor values";
case kInputSize:
return "unexpected size for input tensor";
case kOutputBuffer:
return "unable to get buffer for output tensor values";
case kBatchTooBig:
return "unable to execute batch larger than max-batch-size";
case kTimesteps:
return "unable to execute more than 1 timestep at a time";
case kChunkTooBig:
return "a chunk cannot contain more samples than the WAV_DATA dimension";
default:
break;
}
return "unknown error";
}
} // kaldi
} // custom
} // inferenceserver
} // nvidia

View file

@ -0,0 +1,66 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lat/lattice-functions.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
enum ErrorCodes {
kSuccess,
kUnknown,
kInvalidModelConfig,
kGpuNotSupported,
kSequenceBatcher,
kModelControl,
kInputOutput,
kInputName,
kOutputName,
kInputOutputDataType,
kInputContents,
kInputSize,
kOutputBuffer,
kBatchTooBig,
kTimesteps,
kChunkTooBig
};
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out);
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param);
const char* CustomErrorString(int errcode);
} // kaldi
} // custom
} // inferenceserver
} // nvidia

View file

@ -0,0 +1,401 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend.h"
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
Context::Context(const std::string& instance_name,
const ModelConfig& model_config, const int gpu_device)
: instance_name_(instance_name),
model_config_(model_config),
gpu_device_(gpu_device),
num_channels_(
model_config_
.max_batch_size()),  // "max batch size" is defined differently in Kaldi and TRTIS
int32_byte_size_(GetDataTypeByteSize(TYPE_INT32)),
int64_byte_size_(GetDataTypeByteSize(TYPE_INT64)) {}
Context::~Context() { delete word_syms_; }
int Context::ReadModelParameters() {
// Reading config
float beam, lattice_beam;
int max_active;
int frame_subsampling_factor;
float acoustic_scale;
int num_worker_threads;
int err =
ReadParameter(model_config_, "mfcc_filename",
&batched_decoder_config_.feature_opts.mfcc_config) ||
ReadParameter(
model_config_, "ivector_filename",
&batched_decoder_config_.feature_opts.ivector_extraction_config) ||
ReadParameter(model_config_, "beam", &beam) ||
ReadParameter(model_config_, "lattice_beam", &lattice_beam) ||
ReadParameter(model_config_, "max_active", &max_active) ||
ReadParameter(model_config_, "frame_subsampling_factor",
&frame_subsampling_factor) ||
ReadParameter(model_config_, "acoustic_scale", &acoustic_scale) ||
ReadParameter(model_config_, "nnet3_rxfilename", &nnet3_rxfilename_) ||
ReadParameter(model_config_, "fst_rxfilename", &fst_rxfilename_) ||
ReadParameter(model_config_, "word_syms_rxfilename",
&word_syms_rxfilename_) ||
ReadParameter(model_config_, "num_worker_threads", &num_worker_threads) ||
ReadParameter(model_config_, "max_execution_batch_size",
&max_batch_size_);
if (err) return err;
max_batch_size_ = std::max<int>(max_batch_size_, 1);
num_channels_ = std::max<int>(num_channels_, 1);
// Sanity checks
if (beam <= 0) return kInvalidModelConfig;
if (lattice_beam <= 0) return kInvalidModelConfig;
if (max_active <= 0) return kInvalidModelConfig;
if (acoustic_scale <= 0) return kInvalidModelConfig;
if (num_worker_threads <= 0) return kInvalidModelConfig;
if (num_channels_ <= max_batch_size_) return kInvalidModelConfig;
batched_decoder_config_.compute_opts.frame_subsampling_factor =
frame_subsampling_factor;
batched_decoder_config_.compute_opts.acoustic_scale = acoustic_scale;
batched_decoder_config_.decoder_opts.default_beam = beam;
batched_decoder_config_.decoder_opts.lattice_beam = lattice_beam;
batched_decoder_config_.decoder_opts.max_active = max_active;
batched_decoder_config_.num_worker_threads = num_worker_threads;
batched_decoder_config_.max_batch_size = max_batch_size_;
batched_decoder_config_.num_channels = num_channels_;
auto feature_config = batched_decoder_config_.feature_opts;
kaldi::OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
sample_freq_ = feature_info.mfcc_opts.frame_opts.samp_freq;
BaseFloat frame_shift = feature_info.FrameShiftInSeconds();
seconds_per_chunk_ = chunk_num_samps_ / sample_freq_;
int samp_per_frame = static_cast<int>(sample_freq_ * frame_shift);
float n_input_framesf = chunk_num_samps_ / samp_per_frame;
bool is_integer = (n_input_framesf == std::floor(n_input_framesf));
if (!is_integer) {
std::cerr << "WAVE_DATA dim must be a multiple fo samples per frame ("
<< samp_per_frame << ")" << std::endl;
return kInvalidModelConfig;
}
int n_input_frames = static_cast<int>(std::floor(n_input_framesf));
batched_decoder_config_.compute_opts.frames_per_chunk = n_input_frames;
return kSuccess;
}
int Context::InitializeKaldiPipeline() {
batch_corr_ids_.reserve(max_batch_size_);
batch_wave_samples_.reserve(max_batch_size_);
batch_is_last_chunk_.reserve(max_batch_size_);
wave_byte_buffers_.resize(max_batch_size_);
output_shape_ = {1, 1};
kaldi::CuDevice::Instantiate()
.SelectAndInitializeGpuIdWithExistingCudaContext(gpu_device_);
kaldi::CuDevice::Instantiate().AllowMultithreading();
// Loading models
{
bool binary;
kaldi::Input ki(nnet3_rxfilename_, &binary);
trans_model_.Read(ki.Stream(), binary);
am_nnet_.Read(ki.Stream(), binary);
kaldi::nnet3::SetBatchnormTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::SetDropoutTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::CollapseModel(kaldi::nnet3::CollapseModelConfig(),
&(am_nnet_.GetNnet()));
}
fst::Fst<fst::StdArc>* decode_fst = fst::ReadFstKaldiGeneric(fst_rxfilename_);
cuda_pipeline_.reset(
new kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline(
batched_decoder_config_, *decode_fst, am_nnet_, trans_model_));
delete decode_fst;
// Loading word syms for text output
if (word_syms_rxfilename_ != "") {
if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) {
std::cerr << "Could not read symbol table from file "
<< word_syms_rxfilename_;
return kInvalidModelConfig;
}
}
chunk_num_samps_ = cuda_pipeline_->GetNSampsPerChunk();
chunk_num_bytes_ = chunk_num_samps_ * sizeof(BaseFloat);
return kSuccess;
}
int Context::Init() {
return InputOutputSanityCheck() || ReadModelParameters() ||
InitializeKaldiPipeline();
}
bool Context::CheckPayloadError(const CustomPayload& payload) {
int err = payload.error_code;
if (err) std::cerr << "Error: " << CustomErrorString(err) << std::endl;
return (err != 0);
}
int Context::Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
// kaldi::Timer timer;
if (payload_cnt > num_channels_) return kBatchTooBig;
// Each payload is a chunk for one sequence
// Currently using dynamic batcher, not sequence batcher
for (uint32_t pidx = 0; pidx < payload_cnt; ++pidx) {
if (batch_corr_ids_.size() == max_batch_size_) FlushBatch();
CustomPayload& payload = payloads[pidx];
if (payload.batch_size != 1) payload.error_code = kTimesteps;
if (CheckPayloadError(payload)) continue;
// Get input tensors
int32_t start, dim, end, ready;
CorrelationID corr_id;
const BaseFloat* wave_buffer;
payload.error_code = GetSequenceInput(
input_fn, payload.input_context, &corr_id, &start, &ready, &dim, &end,
&wave_buffer, &wave_byte_buffers_[pidx]);
if (CheckPayloadError(payload)) continue;
if (!ready) continue;
if (dim > chunk_num_samps_) payload.error_code = kChunkTooBig;
if (CheckPayloadError(payload)) continue;
kaldi::SubVector<BaseFloat> wave_part(wave_buffer, dim);
// Initialize corr_id if first chunk
if (start) cuda_pipeline_->InitCorrID(corr_id);
// Add to batch
batch_corr_ids_.push_back(corr_id);
batch_wave_samples_.push_back(wave_part);
batch_is_last_chunk_.push_back(end);
if (end) {
// If last chunk, set the callback for that seq
cuda_pipeline_->SetLatticeCallback(
corr_id, [this, &output_fn, &payloads, pidx,
corr_id](kaldi::CompactLattice& clat) {
std::string output;
LatticeToString(*word_syms_, clat, &output);
SetOutputTensor(output, output_fn, payloads[pidx]);
});
}
}
FlushBatch();
cuda_pipeline_->WaitForLatticeCallbacks();
return kSuccess;
}
int Context::FlushBatch() {
if (!batch_corr_ids_.empty()) {
cuda_pipeline_->DecodeBatch(batch_corr_ids_, batch_wave_samples_,
batch_is_last_chunk_);
batch_corr_ids_.clear();
batch_wave_samples_.clear();
batch_is_last_chunk_.clear();
}
return kSuccess;
}
int Context::InputOutputSanityCheck() {
if (!model_config_.has_sequence_batching()) {
return kSequenceBatcher;
}
auto& batcher = model_config_.sequence_batching();
if (batcher.control_input_size() != 4) {
return kModelControl;
}
std::set<std::string> control_input_names;
for (int i = 0; i < 4; ++i)
control_input_names.insert(batcher.control_input(i).name());
if (!(control_input_names.erase("START") &&
control_input_names.erase("END") &&
control_input_names.erase("CORRID") &&
control_input_names.erase("READY"))) {
return kModelControl;
}
if (model_config_.input_size() != 2) {
return kInputOutput;
}
if ((model_config_.input(0).dims().size() != 1) ||
(model_config_.input(0).dims(0) <= 0) ||
(model_config_.input(1).dims().size() != 1) ||
(model_config_.input(1).dims(0) != 1)) {
return kInputOutput;
}
chunk_num_samps_ = model_config_.input(0).dims(0);
chunk_num_bytes_ = chunk_num_samps_ * sizeof(float);
if ((model_config_.input(0).data_type() != DataType::TYPE_FP32) ||
(model_config_.input(1).data_type() != DataType::TYPE_INT32)) {
return kInputOutputDataType;
}
if ((model_config_.input(0).name() != "WAV_DATA") ||
(model_config_.input(1).name() != "WAV_DATA_DIM")) {
return kInputName;
}
if (model_config_.output_size() != 1) {
return kInputOutput;
}
if ((model_config_.output(0).dims().size() != 1) ||
(model_config_.output(0).dims(0) != 1)) {
return kInputOutput;
}
if (model_config_.output(0).data_type() != DataType::TYPE_STRING) {
return kInputOutputDataType;
}
if (model_config_.output(0).name() != "TEXT") {
return kOutputName;
}
return kSuccess;
}
int Context::GetSequenceInput(CustomGetNextInputFn_t& input_fn,
void* input_context, CorrelationID* corr_id,
int32_t* start, int32_t* ready, int32_t* dim,
int32_t* end, const BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer) {
int err;
// &input_buffer[0] is a raw byte pointer, so it can alias any type;
// the WAV_DATA chunk below is read into it.
// Get the control and metadata tensors first
const void* out;
err = GetInputTensor(input_fn, input_context, "WAV_DATA_DIM",
int32_byte_size_, &byte_buffer_, &out);
if (err != kSuccess) return err;
*dim = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "END", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*end = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "START", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*start = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "READY", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*ready = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "CORRID", int64_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*corr_id = *reinterpret_cast<const CorrelationID*>(out);
// Get pointer to speech tensor
err = GetInputTensor(input_fn, input_context, "WAV_DATA", chunk_num_bytes_,
input_buffer, &out);
if (err != kSuccess) return err;
*wave_buffer = reinterpret_cast<const BaseFloat*>(out);
return kSuccess;
}
int Context::SetOutputTensor(const std::string& output,
CustomGetOutputFn_t output_fn,
CustomPayload payload) {
uint32_t byte_size_with_size_int = output.size() + sizeof(int32);
// std::cout << output << std::endl;
// copy output from best_path to output buffer
if ((payload.error_code == 0) && (payload.output_cnt > 0)) {
const char* output_name = payload.required_output_names[0];
// output buffer
void* obuffer;
if (!output_fn(payload.output_context, output_name, output_shape_.size(),
&output_shape_[0], byte_size_with_size_int, &obuffer)) {
payload.error_code = kOutputBuffer;
return payload.error_code;
}
// If no error but the 'obuffer' is returned as nullptr, then
// skip writing this output.
if (obuffer != nullptr) {
// std::cout << "writing " << output << std::endl;
int32* buffer_as_int = reinterpret_cast<int32*>(obuffer);
buffer_as_int[0] = output.size();
memcpy(&buffer_as_int[1], output.data(), output.size());
}
}
return kSuccess;
}
/////////////
extern "C" {
int CustomInitialize(const CustomInitializeData* data, void** custom_context) {
// Convert the serialized model config to a ModelConfig object.
ModelConfig model_config;
if (!model_config.ParseFromString(std::string(
data->serialized_model_config, data->serialized_model_config_size))) {
return kInvalidModelConfig;
}
// Create the context and validate that the model configuration is
// something that we can handle.
Context* context = new Context(std::string(data->instance_name), model_config,
data->gpu_device_id);
int err = context->Init();
if (err != kSuccess) {
return err;
}
*custom_context = static_cast<void*>(context);
return kSuccess;
}
int CustomFinalize(void* custom_context) {
if (custom_context != nullptr) {
Context* context = static_cast<Context*>(custom_context);
delete context;
}
return kSuccess;
}
const char* CustomErrorString(void* custom_context, int errcode) {
return CustomErrorString(errcode);
}
int CustomExecute(void* custom_context, const uint32_t payload_cnt,
CustomPayload* payloads, CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
if (custom_context == nullptr) {
return kUnknown;
}
Context* context = static_cast<Context*>(custom_context);
return context->Execute(payload_cnt, payloads, input_fn, output_fn);
}
} // extern "C"
}
}
}
} // namespace nvidia::inferenceserver::custom::kaldi_cbe

View file

@ -0,0 +1,119 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define HAVE_CUDA 1 // Loading Kaldi headers with GPU
#include <cfloat>
#include <sstream>
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
#include "fstext/fstext-lib.h"
#include "lat/lattice-functions.h"
#include "nnet3/am-nnet-simple.h"
#include "nnet3/nnet-utils.h"
#include "util/kaldi-thread.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
using kaldi::BaseFloat;
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
// Context object. All state must be kept in this object.
class Context {
public:
Context(const std::string& instance_name, const ModelConfig& config,
const int gpu_device);
virtual ~Context();
// Initialize the context. Validate that the model configuration,
// etc. is something that we can handle.
int Init();
// Perform custom execution on the payloads.
int Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn, CustomGetOutputFn_t output_fn);
private:
// init kaldi pipeline
int InitializeKaldiPipeline();
int InputOutputSanityCheck();
int ReadModelParameters();
int GetSequenceInput(CustomGetNextInputFn_t& input_fn, void* input_context,
CorrelationID* corr_id, int32_t* start, int32_t* ready,
int32_t* dim, int32_t* end,
const kaldi::BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer);
int SetOutputTensor(const std::string& output, CustomGetOutputFn_t output_fn,
CustomPayload payload);
bool CheckPayloadError(const CustomPayload& payload);
int FlushBatch();
// The name of this instance of the backend.
const std::string instance_name_;
// The model configuration.
const ModelConfig model_config_;
// The GPU device ID to execute on or CUSTOM_NO_GPU_DEVICE if should
// execute on CPU.
const int gpu_device_;
// Models paths
std::string nnet3_rxfilename_, fst_rxfilename_;
std::string word_syms_rxfilename_;
// batch_size
int max_batch_size_;
int num_channels_;
int num_worker_threads_;
std::vector<CorrelationID> batch_corr_ids_;
std::vector<kaldi::SubVector<kaldi::BaseFloat>> batch_wave_samples_;
std::vector<bool> batch_is_last_chunk_;
BaseFloat sample_freq_, seconds_per_chunk_;
int chunk_num_bytes_, chunk_num_samps_;
// feature_config includes configuration for the iVector adaptation,
// as well as the basic features.
kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipelineConfig
batched_decoder_config_;
std::unique_ptr<kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline>
cuda_pipeline_;
// Maintain the state of some shared objects
kaldi::TransitionModel trans_model_;
kaldi::nnet3::AmNnetSimple am_nnet_;
fst::SymbolTable* word_syms_;
const uint64_t int32_byte_size_;
const uint64_t int64_byte_size_;
std::vector<int64_t> output_shape_;
std::vector<uint8_t> byte_buffer_;
std::vector<std::vector<uint8_t>> wave_byte_buffers_;
};
} // kaldi
} // custom
} // inferenceserver
} // nvidia

View file

@ -0,0 +1,21 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{
global:
CustomErrorString;
CustomExecute;
CustomFinalize;
CustomInitialize;
local: *;
};

View file

@ -0,0 +1,8 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt /workspace/
WORKDIR /workspace/
RUN pip install --no-cache-dir -r requirements.txt
ADD . /workspace/rn50
WORKDIR /workspace/rn50

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,109 @@
# Convolutional Networks for Image Classification in PyTorch
In this repository you will find implementations of various image classification models.
## Table Of Contents
* [Models](#models)
* [Validation accuracy results](#validation-accuracy-results)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Model comparison](#model-comparison)
* [Accuracy vs FLOPS](#accuracy-vs-flops)
* [Latency vs Throughput on different batch sizes](#latency-vs-throughput-on-different-batch-sizes)
## Models
The following table provides links to where you can find additional information on each model:
| **Model** | **Link**|
|:-:|:-:|
| resnet50 | [README](./resnet50v1.5/README.md) |
| resnext101-32x4d | [README](./resnext101-32x4d/README.md) |
| se-resnext101-32x4d | [README](./se-resnext101-32x4d/README.md) |
## Validation accuracy results
Our results were obtained by running the applicable
training scripts in the [framework-container-name] NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the validation accuracy results of the
three classification models side-by-side.
| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 78.46 | 94.15 | 78.50 | 94.11 |
| resnext101-32x4d | 80.08 | 94.89 | 80.14 | 95.02 |
| se-resnext101-32x4d | 81.01 | 95.52 | 81.12 | 95.54 |
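The Top1/Top5 columns are the standard ImageNet validation metrics: the fraction of images whose ground-truth label is the highest-scoring prediction, or among the five highest-scoring predictions. As a minimal, hedged sketch of how such numbers can be computed from model logits (an illustration only, not the validation code shipped with these models; `topk_accuracy` is a hypothetical helper name):

```python
import torch

def topk_accuracy(logits, target, ks=(1, 5)):
    # Illustrative sketch; not part of this repository.
    # logits: [batch, num_classes], target: [batch] of class indices
    maxk = max(ks)
    _, pred = logits.topk(maxk, dim=1, largest=True, sorted=True)  # [batch, maxk]
    correct = pred.eq(target.view(-1, 1))                          # boolean hits
    # percentage of samples whose true label is within the top-k predictions
    return [correct[:, :k].any(dim=1).float().mean().item() * 100.0 for k in ks]

# Usage sketch: accumulate per-batch results over the validation set and average.
# top1, top5 = topk_accuracy(model(images), labels)
```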
## Training performance results
### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 6888.75 img/s | 2945.37 img/s | 2.34x |
| resnext101-32x4d | 2384.85 img/s | 1116.58 img/s | 2.14x |
| se-resnext101-32x4d | 2031.17 img/s | 977.45 img/s | 2.08x |
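As a rough illustration of how per-epoch throughput averages like the ones above can be gathered (the authoritative procedure lives in each model's training script; the names below are placeholders), one can time every training iteration and divide the number of processed images by the accumulated time:

```python
import time
import torch

def epoch_throughput(model, loader, optimizer, criterion, device="cuda"):
    # Illustrative sketch; averages images/second over one full training epoch.
    model.train()
    total_images, total_time = 0, 0.0
    for images, target in loader:
        images, target = images.to(device), target.to(device)
        torch.cuda.synchronize()              # start timing with an idle GPU
        start = time.time()
        optimizer.zero_grad()
        loss = criterion(model(images), target)
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize()              # make GPU work visible to the host timer
        total_time += time.time() - start
        total_images += images.size(0)
    return total_images / total_time          # img/s averaged over the epoch
```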
### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 13443.82 img/s | 6263.41 img/s | 2.15x |
| resnext101-32x4d | 4473.37 img/s | 2261.97 img/s | 1.98x |
| se-resnext101-32x4d | 3776.03 img/s | 1953.13 img/s | 1.93x |
## Model Comparison
### Accuracy vs FLOPS
![ACCvsFLOPS](./img/ACCvsFLOPS.png)
The plot shows the relationship between validation accuracy and the number of
floating-point operations needed to compute a forward pass on a
224px x 224px image for the implemented models.
Dot size indicates the number of trainable parameters.
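The parameter counts behind the dot sizes can be reproduced directly, and forward-pass FLOPs can be approximated with forward hooks. The sketch below counts only Conv2d and Linear multiply-accumulates and is an approximation, not the exact method used to produce the plot:

```python
import torch
import torch.nn as nn

def count_trainable_params(model):
    # Illustrative sketch; not part of this repository.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def estimate_forward_flops(model, image_size=224):
    # Rough estimate: 2 * MACs, counting Conv2d and Linear layers only.
    flops = []
    def conv_hook(m, inp, out):
        kernel_ops = (m.in_channels // m.groups) * m.kernel_size[0] * m.kernel_size[1]
        flops.append(2 * out.numel() * kernel_ops)
    def linear_hook(m, inp, out):
        flops.append(2 * out.numel() * m.in_features)
    handles = [m.register_forward_hook(conv_hook) for m in model.modules() if isinstance(m, nn.Conv2d)]
    handles += [m.register_forward_hook(linear_hook) for m in model.modules() if isinstance(m, nn.Linear)]
    device = next(model.parameters()).device
    with torch.no_grad():
        model(torch.zeros(1, 3, image_size, image_size, device=device))
    for h in handles:
        h.remove()
    return sum(flops)
```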
### Latency vs Throughput on different batch sizes
![LATvsTHR](./img/LATvsTHR.png)
The plot shows the relationship between
inference latency, throughput, and batch size
for the implemented models.
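A hedged sketch of how latency/throughput pairs of this kind can be measured at different batch sizes by timing forward passes on random data (the plot itself comes from the repository's own benchmarking; `measure` below is a hypothetical helper):

```python
import time
import torch

@torch.no_grad()
def measure(model, batch_size, image_size=224, iters=100, warmup=10, fp16=False):
    # Illustrative sketch; not part of this repository.
    model.eval().cuda()
    x = torch.randn(batch_size, 3, image_size, image_size, device="cuda")
    if fp16:
        model, x = model.half(), x.half()
    for _ in range(warmup):                   # warm up cuDNN autotuning and caches
        model(x)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        model(x)
    torch.cuda.synchronize()
    latency = (time.time() - start) / iters   # seconds per batch
    return latency * 1000.0, batch_size / latency   # (ms/batch, img/s)

# for bs in (1, 2, 4, 8, 16, 32, 64, 128):
#     print(bs, measure(model, bs))
```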

View file

@ -0,0 +1,42 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def add_parser_arguments(parser):
parser.add_argument(
"--checkpoint-path", metavar="<path>", help="checkpoint filename"
)
parser.add_argument(
"--weight-path", metavar="<path>", help="name of file in which to store weights"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
checkpoint = torch.load(args.checkpoint_path)
model_state_dict = {
k[len("module.1.") :] if "module.1." in k else k: v
for k, v in checkpoint["state_dict"].items()
}
print(f"Loaded {checkpoint['arch']} : {checkpoint['best_prec1']}")
torch.save(model_state_dict, args.weight_path)

View file

@ -0,0 +1,94 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from PIL import Image
import argparse
import numpy as np
import json
import torch
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import image_classification.resnet as models
from image_classification.dataloaders import load_jpeg_from_file
try:
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument("--image-size", default="224", type=int)
parser.add_argument(
"--arch",
"-a",
metavar="ARCH",
default="resnet50",
choices=model_names,
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
)
parser.add_argument(
"--model-config",
"-c",
metavar="CONF",
default="classic",
choices=model_configs,
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
)
parser.add_argument("--weights", metavar="<path>", help="file with model weights")
parser.add_argument(
"--precision", metavar="PREC", default="FP16", choices=["AMP", "FP16", "FP32"]
)
parser.add_argument("--image", metavar="<path>", help="path to classified image")
def main(args):
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
model = models.build_resnet(args.arch, args.model_config, verbose=False)
if args.weights is not None:
weights = torch.load(args.weights)
model.load_state_dict(weights)
model = model.cuda()
if args.precision == "FP16":
model = network_to_half(model)
model.eval()
with torch.no_grad():
input = load_jpeg_from_file(args.image, cuda=True, fp16=args.precision!='FP32')
output = torch.nn.functional.softmax(model(input), dim=1).cpu().view(-1).numpy()
top5 = np.argsort(output)[-5:][::-1]
print(args.image)
for c, v in zip(imgnet_classes[top5], output[top5]):
print(f"{c}: {100*v:.1f}%")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
main(args)

View file

@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import logger
from . import dataloaders
from . import training
from . import utils
from . import mixup
from . import resnet
from . import smoothing

View file

@ -1,10 +1,40 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
DATA_BACKEND_CHOICES = ['pytorch']
DATA_BACKEND_CHOICES = ['pytorch', 'syntetic']
try:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from nvidia.dali.pipeline import Pipeline
@ -16,38 +46,68 @@ except ImportError:
print("Please install DALI from https://www.github.com/NVIDIA/DALI to run this example.")
def load_jpeg_from_file(path, cuda=True, fp16=False):
img_transforms = transforms.Compose(
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
)
img = img_transforms(Image.open(path))
with torch.no_grad():
# mean and std are not multiplied by 255 as they are in training script
# torch dataloader reads data into bytes whereas loading directly
# through PIL creates a tensor with floats in [0,1] range
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
if cuda:
mean = mean.cuda()
std = std.cuda()
img = img.cuda()
if fp16:
mean = mean.half()
std = std.half()
img = img.half()
else:
img = img.float()
input = img.unsqueeze(0).sub_(mean).div_(std)
return input
class HybridTrainPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
if torch.distributed.is_initialized():
local_rank = torch.distributed.get_rank()
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
local_rank = 0
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root = data_dir,
shard_id = local_rank,
shard_id = rank,
num_shards = world_size,
random_shuffle = True)
if dali_cpu:
dali_device = "cpu"
self.decode = ops.HostDecoderRandomCrop(device=dali_device, output_type=types.RGB,
random_aspect_ratio=[0.75, 4./3.],
random_area=[0.08, 1.0],
num_attempts=100)
self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
# This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# without additional reallocations
self.decode = ops.nvJPEGDecoderRandomCrop(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512,
random_aspect_ratio=[0.75, 4./3.],
random_area=[0.08, 1.0],
num_attempts=100)
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512)
self.res = ops.RandomResizedCrop(
device=dali_device,
size=[crop, crop],
interp_type=types.INTERP_LINEAR,
random_aspect_ratio=[0.75, 4./3.],
random_area=[0.08, 1.0],
num_attempts=100)
self.res = ops.Resize(device=dali_device, resize_x=crop, resize_y=crop, interp_type=types.INTERP_TRIANGULAR)
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT,
output_layout = types.NCHW,
@ -70,19 +130,19 @@ class HybridValPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
if torch.distributed.is_initialized():
local_rank = torch.distributed.get_rank()
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
local_rank = 0
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root = data_dir,
shard_id = local_rank,
shard_id = rank,
num_shards = world_size,
random_shuffle = False)
self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
self.decode = ops.ImageDecoder(device = "mixed", output_type = types.RGB)
self.res = ops.Resize(device = "gpu", resize_shorter = size)
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT,
@ -104,7 +164,7 @@ class DALIWrapper(object):
def gen_wrapper(dalipipeline, num_classes, one_hot):
for data in dalipipeline:
input = data[0]["data"]
target = data[0]["label"].squeeze().cuda().long()
target = torch.reshape(data[0]["label"], [-1]).cuda().long()
if one_hot:
target = expand(num_classes, torch.float, target)
yield input, target
@ -121,16 +181,16 @@ class DALIWrapper(object):
def get_dali_train_loader(dali_cpu=False):
def gdtl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
if torch.distributed.is_initialized():
local_rank = torch.distributed.get_rank()
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
local_rank = 0
rank = 0
world_size = 1
traindir = os.path.join(data_path, 'train')
pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
device_id = local_rank,
device_id = rank % torch.cuda.device_count(),
data_dir = traindir, crop = 224, dali_cpu=dali_cpu)
pipe.build()
@ -144,18 +204,19 @@ def get_dali_train_loader(dali_cpu=False):
def get_dali_val_loader():
def gdvl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
if torch.distributed.is_initialized():
local_rank = torch.distributed.get_rank()
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
local_rank = 0
rank = 0
world_size = 1
valdir = os.path.join(data_path, 'val')
pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
device_id = local_rank,
device_id = rank % torch.cuda.device_count(),
data_dir = valdir,
crop = 224, size = 256)
pipe.build()
val_loader = DALIClassificationIterator(pipe, size = int(pipe.epoch_size("Reader") / world_size))
@ -199,8 +260,8 @@ class PrefetchedWrapper(object):
for next_input, next_target in loader:
with torch.cuda.stream(stream):
next_input = next_input.cuda(async=True)
next_target = next_target.cuda(async=True)
next_input = next_input.cuda(non_blocking=True)
next_target = next_target.cuda(non_blocking=True)
if fp16:
next_input = next_input.half()
if one_hot:
@ -280,3 +341,25 @@ def get_pytorch_val_loader(data_path, batch_size, num_classes, one_hot, workers=
collate_fn=fast_collate)
return PrefetchedWrapper(val_loader, num_classes, fp16, one_hot), len(val_loader)
class SynteticDataLoader(object):
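    # Generates a single random batch on the GPU at construction time and yields it
    # forever, so no real dataset or host-side I/O is involved.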
def __init__(self, fp16, batch_size, num_classes, num_channels, height, width, one_hot):
input_data = torch.empty(batch_size, num_channels, height, width).cuda().normal_(0, 1.0)
if one_hot:
input_target = torch.empty(batch_size, num_classes).cuda()
input_target[:, 0] = 1.0
else:
input_target = torch.randint(0, num_classes, (batch_size,))
input_target=input_target.cuda()
if fp16:
input_data = input_data.half()
self.input_data = input_data
self.input_target = input_target
def __iter__(self):
while True:
yield self.input_data, self.input_target
def get_syntetic_loader(data_path, batch_size, num_classes, one_hot, workers=None, _worker_init_fn=None, fp16=False):
return SynteticDataLoader(fp16, batch_size, 1000, 3, 224, 224, one_hot), -1

View file

@ -0,0 +1,310 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
import dllogger
import numpy as np
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
if len(step) == 0:
s = "Summary:"
return s
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
class Meter(object):
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
self.run_aggregator = run_aggregator
self.epoch_aggregator = epoch_aggregator
self.iteration_aggregator = iteration_aggregator
def record(self, val, n=1):
self.iteration_aggregator.record(val, n=n)
def get_iteration(self):
v, n = self.iteration_aggregator.get_val()
return v
def reset_iteration(self):
v, n = self.iteration_aggregator.get_data()
self.iteration_aggregator.reset()
if v is not None:
self.epoch_aggregator.record(v, n=n)
def get_epoch(self):
v, n = self.epoch_aggregator.get_val()
return v
def reset_epoch(self):
v, n = self.epoch_aggregator.get_data()
self.epoch_aggregator.reset()
if v is not None:
self.run_aggregator.record(v, n=n)
def get_run(self):
v, n = self.run_aggregator.get_val()
return v
def reset_run(self):
self.run_aggregator.reset()
class QuantileMeter(object):
def __init__(self, q):
self.q = q
self.reset()
def reset(self):
self.vals = []
self.n = 0
def record(self, val, n=1):
if isinstance(val, list):
self.vals += val
self.n += len(val)
else:
self.vals += [val] * n
self.n += n
def get_val(self):
if not self.vals:
return None, self.n
return np.quantile(self.vals, self.q, interpolation='nearest'), self.n
def get_data(self):
return self.vals, self.n
class MaxMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.max = None
self.n = 0
def record(self, val, n=1):
if self.max is None:
self.max = val
else:
self.max = max(self.max, val)
self.n = n
def get_val(self):
return self.max, self.n
def get_data(self):
return self.max, self.n
class MinMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.min = None
self.n = 0
def record(self, val, n=1):
if self.min is None:
self.min = val
else:
self.min = min(self.min, val)
self.n = n
def get_val(self):
return self.min, self.n
def get_data(self):
return self.min, self.n
class LastMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.last = None
self.n = 0
def record(self, val, n=1):
self.last = val
self.n = n
def get_val(self):
return self.last, self.n
def get_data(self):
return self.last, self.n
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.n = 0
self.val = 0
def record(self, val, n=1):
self.n += n
self.val += val * n
def get_val(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
def get_data(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
class Logger(object):
def __init__(self, print_interval, backends, verbose=False):
self.epoch = -1
self.iteration = -1
self.val_iteration = -1
self.metrics = OrderedDict()
self.backends = backends
self.print_interval = print_interval
self.verbose = verbose
dllogger.init(backends)
def log_parameter(self, data, verbosity=0):
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
if self.verbose:
print("Registering metric: {}".format(metric_name))
self.metrics[metric_name] = {'meter': meter, 'level': verbosity}
dllogger.metadata(metric_name, metadata)
def log_metric(self, metric_name, val, n=1):
self.metrics[metric_name]['meter'].record(val, n=n)
def start_iteration(self, val=False):
if val:
self.val_iteration += 1
else:
self.iteration += 1
def end_iteration(self, val=False):
it = self.val_iteration if val else self.iteration
if (it % self.print_interval == 0):
metrics = {
n: m
for n, m in self.metrics.items() if n.startswith('val') == val
}
step = (self.epoch,
self.iteration) if not val else (self.epoch,
self.iteration,
self.val_iteration)
verbositys = {m['level'] for _, m in metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in metrics.items() if m['level'] == ll}
dllogger.log(step=step,
data={
n: m['meter'].get_iteration()
for n, m in llm.items()
},
verbosity=ll)
for n, m in metrics.items():
m['meter'].reset_iteration()
dllogger.flush()
def start_epoch(self):
self.epoch += 1
self.iteration = 0
self.val_iteration = 0
for n, m in self.metrics.items():
m['meter'].reset_epoch()
def end_epoch(self):
for n, m in self.metrics.items():
m['meter'].reset_iteration()
verbositys = {m['level'] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m['level'] == ll}
dllogger.log(step=(self.epoch, ),
data={n: m['meter'].get_epoch()
for n, m in llm.items()})
def end(self):
for n, m in self.metrics.items():
m['meter'].reset_epoch()
verbositys = {m['level'] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m['level'] == ll}
dllogger.log(step=tuple(),
data={n: m['meter'].get_run()
for n, m in llm.items()})
for n, m in self.metrics.items():
m['meter'].reset_epoch()
dllogger.flush()
def iteration_generator_wrapper(self, gen, val=False):
for g in gen:
self.start_iteration(val=val)
yield g
self.end_iteration(val=val)
def epoch_generator_wrapper(self, gen):
for g in gen:
self.start_epoch()
yield g
self.end_epoch()

View file

@ -1,3 +1,16 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np

View file

@ -0,0 +1,354 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import torch
import torch.nn as nn
import numpy as np
__all__ = ['ResNet', 'build_resnet', 'resnet_versions', 'resnet_configs']
# ResNetBuilder {{{
class ResNetBuilder(object):
def __init__(self, version, config):
self.conv3x3_cardinality = 1 if 'cardinality' not in version.keys() else version['cardinality']
self.config = config
def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
conv = nn.Conv2d(
in_planes, out_planes,
kernel_size=kernel_size, groups=groups,
stride=stride, padding=int((kernel_size - 1)/2),
bias=False)
if self.config['nonlinearity'] == 'relu':
nn.init.kaiming_normal_(conv.weight,
mode=self.config['conv_init'],
nonlinearity=self.config['nonlinearity'])
return conv
def conv3x3(self, in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
c = self.conv(3, in_planes, out_planes, groups=self.conv3x3_cardinality, stride=stride)
return c
def conv1x1(self, in_planes, out_planes, stride=1):
"""1x1 convolution with padding"""
c = self.conv(1, in_planes, out_planes, stride=stride)
return c
def conv7x7(self, in_planes, out_planes, stride=1):
"""7x7 convolution with padding"""
c = self.conv(7, in_planes, out_planes, stride=stride)
return c
def conv5x5(self, in_planes, out_planes, stride=1):
"""5x5 convolution with padding"""
c = self.conv(5, in_planes, out_planes, stride=stride)
return c
def batchnorm(self, planes, last_bn=False):
bn = nn.BatchNorm2d(planes)
gamma_init_val = 0 if last_bn and self.config['last_bn_0_init'] else 1
nn.init.constant_(bn.weight, gamma_init_val)
nn.init.constant_(bn.bias, 0)
return bn
def activation(self):
return self.config['activation']()
# ResNetBuilder }}}
# BasicBlock {{{
class BasicBlock(nn.Module):
def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = builder.conv3x3(inplanes, planes, stride)
self.bn1 = builder.batchnorm(planes)
self.relu = builder.activation()
self.conv2 = builder.conv3x3(planes, planes*expansion)
self.bn2 = builder.batchnorm(planes*expansion, last_bn=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
if self.bn1 is not None:
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.bn2 is not None:
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
# BasicBlock }}}
# SqueezeAndExcitation {{{
class SqueezeAndExcitation(nn.Module):
def __init__(self, planes, squeeze):
super(SqueezeAndExcitation, self).__init__()
self.squeeze = nn.Linear(planes, squeeze)
self.expand = nn.Linear(squeeze, planes)
self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = torch.mean(x.view(x.size(0), x.size(1), -1), 2)
out = self.squeeze(out)
out = self.relu(out)
out = self.expand(out)
out = self.sigmoid(out)
out = out.unsqueeze(2).unsqueeze(3)
return out
# }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
def __init__(self, builder, inplanes, planes, expansion, stride=1, se=False, se_squeeze=16, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = builder.conv1x1(inplanes, planes)
self.bn1 = builder.batchnorm(planes)
self.conv2 = builder.conv3x3(planes, planes, stride=stride)
self.bn2 = builder.batchnorm(planes)
self.conv3 = builder.conv1x1(planes, planes * expansion)
self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
self.relu = builder.activation()
self.downsample = downsample
self.stride = stride
self.squeeze = SqueezeAndExcitation(planes*expansion, se_squeeze) if se else None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.squeeze is None:
out += residual
else:
out = torch.addcmul(residual, 1.0, out, self.squeeze(out))
out = self.relu(out)
return out
def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
return Bottleneck(builder, inplanes, planes, expansion, stride=stride, se=True, se_squeeze=16, downsample=downsample)
# Bottleneck }}}
# ResNet {{{
class ResNet(nn.Module):
def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = builder.conv7x7(3, 64, stride=2)
self.bn1 = builder.batchnorm(64)
self.relu = builder.activation()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
self.layer2 = self._make_layer(builder, block, expansion, widths[1], layers[1], stride=2)
self.layer3 = self._make_layer(builder, block, expansion, widths[2], layers[2], stride=2)
self.layer4 = self._make_layer(builder, block, expansion, widths[3], layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(widths[3] * expansion, num_classes)
def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * expansion:
dconv = builder.conv1x1(self.inplanes, planes * expansion,
stride=stride)
dbn = builder.batchnorm(planes * expansion)
if dbn is not None:
downsample = nn.Sequential(dconv, dbn)
else:
downsample = dconv
layers = []
layers.append(block(builder, self.inplanes, planes, expansion, stride=stride, downsample=downsample))
self.inplanes = planes * expansion
for i in range(1, blocks):
layers.append(block(builder, self.inplanes, planes, expansion))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
if self.bn1 is not None:
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
# ResNet }}}
resnet_configs = {
'classic' : {
'conv' : nn.Conv2d,
'conv_init' : 'fan_out',
'nonlinearity' : 'relu',
'last_bn_0_init' : False,
'activation' : lambda: nn.ReLU(inplace=True),
},
'fanin' : {
'conv' : nn.Conv2d,
'conv_init' : 'fan_in',
'nonlinearity' : 'relu',
'last_bn_0_init' : False,
'activation' : lambda: nn.ReLU(inplace=True),
},
'grp-fanin' : {
'conv' : nn.Conv2d,
'conv_init' : 'fan_in',
'nonlinearity' : 'relu',
'last_bn_0_init' : False,
'activation' : lambda: nn.ReLU(inplace=True),
},
'grp-fanout' : {
'conv' : nn.Conv2d,
'conv_init' : 'fan_out',
'nonlinearity' : 'relu',
'last_bn_0_init' : False,
'activation' : lambda: nn.ReLU(inplace=True),
},
}
resnet_versions = {
'resnet18' : {
'net' : ResNet,
'block' : BasicBlock,
'layers' : [2, 2, 2, 2],
'widths' : [64, 128, 256, 512],
'expansion' : 1,
'num_classes' : 1000,
},
'resnet34' : {
'net' : ResNet,
'block' : BasicBlock,
'layers' : [3, 4, 6, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 1,
'num_classes' : 1000,
},
'resnet50' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 4, 6, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : 1000,
},
'resnet101' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 4, 23, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : 1000,
},
'resnet152' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 8, 36, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : 1000,
},
'resnext101-32x4d' : {
'net' : ResNet,
'block' : Bottleneck,
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansion' : 2,
'num_classes' : 1000,
},
'se-resnext101-32x4d' : {
'net' : ResNet,
'block' : SEBottleneck,
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansion' : 2,
'num_classes' : 1000,
},
}
def build_resnet(version, config, verbose=True):
version = resnet_versions[version]
config = resnet_configs[config]
builder = ResNetBuilder(version, config)
if verbose:
print("Version: {}".format(version))
print("Config: {}".format(config))
model = version['net'](builder,
version['block'],
version['expansion'],
version['layers'],
version['widths'],
version['num_classes'])
return model
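# Example: build_resnet('resnet50', 'classic') selects the architecture definition from
# resnet_versions and the convolution/BN/activation settings from resnet_configs above.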

View file

@@ -1,3 +1,16 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
@@ -23,4 +36,3 @@ class LabelSmoothing(nn.Module):
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()

View file

@@ -0,0 +1,532 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import logger as log
from . import resnet as models
from . import utils
import dllogger
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
ACC_METADATA = {'unit': '%','format': ':.2f'}
IPS_METADATA = {'unit': 'img/s', 'format': ':.2f'}
TIME_METADATA = {'unit': 's', 'format': ':.5f'}
LOSS_METADATA = {'format': ':.5f'}
class ModelAndLoss(nn.Module):
def __init__(self,
arch,
loss,
pretrained_weights=None,
cuda=True,
fp16=False):
super(ModelAndLoss, self).__init__()
self.arch = arch
print("=> creating model '{}'".format(arch))
model = models.build_resnet(arch[0], arch[1])
if pretrained_weights is not None:
print("=> using pre-trained model from a file '{}'".format(arch))
model.load_state_dict(pretrained_weights)
if cuda:
model = model.cuda()
if fp16:
model = network_to_half(model)
# define loss function (criterion) and optimizer
criterion = loss()
if cuda:
criterion = criterion.cuda()
self.model = model
self.loss = criterion
def forward(self, data, target):
output = self.model(data)
loss = self.loss(output, target)
return loss, output
def distributed(self):
self.model = DDP(self.model)
def load_model_state(self, state):
if state is not None:
self.model.load_state_dict(state)
def get_optimizer(parameters,
fp16,
lr,
momentum,
weight_decay,
nesterov=False,
state=None,
static_loss_scale=1.,
dynamic_loss_scale=False,
bn_weight_decay=False):
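# Parameters whose names contain 'bn' are treated as BatchNorm parameters; when
# bn_weight_decay is False they are placed in a separate group with weight_decay=0.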
if bn_weight_decay:
print(" ! Weight decay applied to BN parameters ")
optimizer = torch.optim.SGD([v for n, v in parameters],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov)
else:
print(" ! Weight decay NOT applied to BN parameters ")
bn_params = [v for n, v in parameters if 'bn' in n]
rest_params = [v for n, v in parameters if 'bn' not in n]
print(len(bn_params))
print(len(rest_params))
optimizer = torch.optim.SGD([{
'params': bn_params,
'weight_decay': 0
}, {
'params': rest_params,
'weight_decay': weight_decay
}],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov)
if fp16:
optimizer = FP16_Optimizer(optimizer,
static_loss_scale=static_loss_scale,
dynamic_loss_scale=dynamic_loss_scale,
verbose=False)
if state is not None:
optimizer.load_state_dict(state)
return optimizer
def lr_policy(lr_fn, logger=None):
if logger is not None:
logger.register_metric('lr',
log.LR_METER(),
verbosity=dllogger.Verbosity.VERBOSE)
def _alr(optimizer, iteration, epoch):
lr = lr_fn(iteration, epoch)
if logger is not None:
logger.log_metric('lr', lr)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
return _alr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
lr = base_lr
for s in steps:
if epoch >= s:
lr *= decay_factor
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = base_lr * (1 - (e / es))
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_exponential_policy(base_lr,
warmup_length,
epochs,
final_multiplier=0.001,
logger=None):
es = epochs - warmup_length
epoch_decay = np.power(2, np.log2(final_multiplier) / es)
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
lr = base_lr * (epoch_decay**e)
return lr
return lr_policy(_lr_fn, logger=logger)
def get_train_step(model_and_loss,
optimizer,
fp16,
use_amp=False,
batch_size_multiplier=1):
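# Returns a closure that runs one forward/backward pass per call; gradients are
# accumulated across batch_size_multiplier calls and only applied when optimizer_step=True.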
def _step(input, target, optimizer_step=True):
input_var = Variable(input)
target_var = Variable(target)
loss, output = model_and_loss(input_var, target_var)
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
else:
reduced_loss = loss.data
if fp16:
optimizer.backward(loss)
elif use_amp:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if optimizer_step:
opt = optimizer.optimizer if isinstance(
optimizer, FP16_Optimizer) else optimizer
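# Average the gradients accumulated over batch_size_multiplier micro-batches before stepping.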
for param_group in opt.param_groups:
for param in param_group['params']:
param.grad /= batch_size_multiplier
optimizer.step()
optimizer.zero_grad()
torch.cuda.synchronize()
return reduced_loss
return _step
def train(train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
use_amp=False,
prof=-1,
batch_size_multiplier=1,
register_metrics=True):
if register_metrics and logger is not None:
logger.register_metric('train.loss',
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA)
logger.register_metric('train.compute_ips',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA)
logger.register_metric('train.total_ips',
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA)
logger.register_metric('train.data_time',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
logger.register_metric('train.compute_time',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
step = get_train_step(model_and_loss,
optimizer,
fp16,
use_amp=use_amp,
batch_size_multiplier=batch_size_multiplier)
model_and_loss.train()
end = time.time()
optimizer.zero_grad()
data_iter = enumerate(train_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
for i, (input, target) in data_iter:
bs = input.size(0)
lr_scheduler(optimizer, i, epoch)
data_time = time.time() - end
optimizer_step = ((i + 1) % batch_size_multiplier) == 0
loss = step(input, target, optimizer_step=optimizer_step)
it_time = time.time() - end
if logger is not None:
logger.log_metric('train.loss', to_python_float(loss), bs)
logger.log_metric('train.compute_ips',
calc_ips(bs, it_time - data_time))
logger.log_metric('train.total_ips', calc_ips(bs, it_time))
logger.log_metric('train.data_time', data_time)
logger.log_metric('train.compute_time', it_time - data_time)
end = time.time()
def get_val_step(model_and_loss):
def _step(input, target):
input_var = Variable(input)
target_var = Variable(target)
with torch.no_grad():
loss, output = model_and_loss(input_var, target_var)
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
prec1 = utils.reduce_tensor(prec1)
prec5 = utils.reduce_tensor(prec5)
else:
reduced_loss = loss.data
torch.cuda.synchronize()
return reduced_loss, prec1, prec5
return _step
def validate(val_loader,
model_and_loss,
fp16,
logger,
epoch,
prof=-1,
register_metrics=True):
if register_metrics and logger is not None:
logger.register_metric('val.top1',
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA)
logger.register_metric('val.top5',
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA)
logger.register_metric('val.loss',
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA)
logger.register_metric('val.compute_ips',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA)
logger.register_metric('val.total_ips',
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA)
logger.register_metric('val.data_time',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
logger.register_metric('val.compute_latency',
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
logger.register_metric('val.compute_latency_at100',
log.LAT_100(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
logger.register_metric('val.compute_latency_at99',
log.LAT_99(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
logger.register_metric('val.compute_latency_at95',
log.LAT_95(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA)
step = get_val_step(model_and_loss)
top1 = log.AverageMeter()
# switch to evaluate mode
model_and_loss.eval()
end = time.time()
data_iter = enumerate(val_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
for i, (input, target) in data_iter:
bs = input.size(0)
data_time = time.time() - end
loss, prec1, prec5 = step(input, target)
it_time = time.time() - end
top1.record(to_python_float(prec1), bs)
if logger is not None:
logger.log_metric('val.top1', to_python_float(prec1), bs)
logger.log_metric('val.top5', to_python_float(prec5), bs)
logger.log_metric('val.loss', to_python_float(loss), bs)
logger.log_metric('val.compute_ips',
calc_ips(bs, it_time - data_time))
logger.log_metric('val.total_ips', calc_ips(bs, it_time))
logger.log_metric('val.data_time', data_time)
logger.log_metric('val.compute_latency', it_time - data_time)
logger.log_metric('val.compute_latency_at95', it_time - data_time)
logger.log_metric('val.compute_latency_at99', it_time - data_time)
logger.log_metric('val.compute_latency_at100', it_time - data_time)
end = time.time()
return top1.get_val()
# Train loop {{{
def calc_ips(batch_size, time):
world_size = torch.distributed.get_world_size(
) if torch.distributed.is_initialized() else 1
tbs = world_size * batch_size
return tbs / time
def train_loop(model_and_loss,
optimizer,
lr_scheduler,
train_loader,
val_loader,
epochs,
fp16,
logger,
should_backup_checkpoint,
use_amp=False,
batch_size_multiplier=1,
best_prec1=0,
start_epoch=0,
prof=-1,
skip_training=False,
skip_validation=False,
save_checkpoints=True,
checkpoint_dir='./'):
prec1 = -1
epoch_iter = range(start_epoch, epochs)
for epoch in epoch_iter:
if logger is not None:
logger.start_epoch()
if not skip_training:
train(train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
use_amp=use_amp,
prof=prof,
register_metrics=epoch == start_epoch,
batch_size_multiplier=batch_size_multiplier)
if not skip_validation:
prec1, nimg = validate(val_loader,
model_and_loss,
fp16,
logger,
epoch,
prof=prof,
register_metrics=epoch == start_epoch)
if logger is not None:
logger.end_epoch()
if save_checkpoints and (not torch.distributed.is_initialized()
or torch.distributed.get_rank() == 0):
if not skip_validation:
is_best = logger.metrics['val.top1']['meter'].get_epoch() > best_prec1
best_prec1 = max(logger.metrics['val.top1']['meter'].get_epoch(),
best_prec1)
else:
is_best = False
best_prec1 = 0
if should_backup_checkpoint(epoch):
backup_filename = 'checkpoint-{}.pth.tar'.format(epoch + 1)
else:
backup_filename = None
utils.save_checkpoint(
{
'epoch': epoch + 1,
'arch': model_and_loss.arch,
'state_dict': model_and_loss.model.state_dict(),
'best_prec1': best_prec1,
'optimizer': optimizer.state_dict(),
},
is_best,
checkpoint_dir=checkpoint_dir,
backup_filename=backup_filename)
# }}}

View file

@@ -0,0 +1,106 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time  # needed by timed_generator/timed_function below
import numpy as np
import torch
import shutil
import torch.distributed as dist
def should_backup_checkpoint(args):
def _sbc(epoch):
return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
return _sbc
def save_checkpoint(state,
is_best,
filename='checkpoint.pth.tar',
checkpoint_dir='./',
backup_filename=None):
if (not torch.distributed.is_initialized()
) or torch.distributed.get_rank() == 0:
filename = os.path.join(checkpoint_dir, filename)
print("SAVING {}".format(filename))
torch.save(state, filename)
if is_best:
shutil.copyfile(filename,
os.path.join(checkpoint_dir, 'model_best.pth.tar'))
if backup_filename is not None:
shutil.copyfile(filename,
os.path.join(checkpoint_dir, backup_filename))
def timed_generator(gen):
start = time.time()
for g in gen:
end = time.time()
t = end - start
yield g, t
start = time.time()
def timed_function(f):
def _timed_function(*args, **kwargs):
start = time.time()
ret = f(*args, **kwargs)
return ret, time.time() - start
return _timed_function
def accuracy(output, target, topk=(1, )):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
def reduce_tensor(tensor):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= torch.distributed.get_world_size(
) if torch.distributed.is_initialized() else 1
return rt
def first_n(n, generator):
for i, d in zip(range(n), generator):
yield d

Binary file not shown.


Binary file not shown.


View file

@@ -0,0 +1,475 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import os
import shutil
import time
import random
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
import image_classification.resnet as models
import image_classification.logger as log
from image_classification.smoothing import LabelSmoothing
from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
from image_classification.dataloaders import *
from image_classification.training import *
from image_classification.utils import *
import dllogger
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument('data', metavar='DIR', help='path to dataset')
parser.add_argument('--data-backend',
metavar='BACKEND',
default='dali-cpu',
choices=DATA_BACKEND_CHOICES,
help='data backend: ' +
' | '.join(DATA_BACKEND_CHOICES) +
' (default: dali-cpu)')
parser.add_argument('--arch',
'-a',
metavar='ARCH',
default='resnet50',
choices=model_names,
help='model architecture: ' + ' | '.join(model_names) +
' (default: resnet50)')
parser.add_argument('--model-config',
'-c',
metavar='CONF',
default='classic',
choices=model_configs,
help='model configs: ' + ' | '.join(model_configs) +
'(default: classic)')
parser.add_argument('-j',
'--workers',
default=5,
type=int,
metavar='N',
help='number of data loading workers (default: 5)')
parser.add_argument('--epochs',
default=90,
type=int,
metavar='N',
help='number of total epochs to run')
parser.add_argument('-b',
'--batch-size',
default=256,
type=int,
metavar='N',
help='mini-batch size (default: 256) per gpu')
parser.add_argument(
'--optimizer-batch-size',
default=-1,
type=int,
metavar='N',
help=
'size of a total batch size, for simulating bigger batches using gradient accumulation'
)
parser.add_argument('--lr',
'--learning-rate',
default=0.1,
type=float,
metavar='LR',
help='initial learning rate')
parser.add_argument('--lr-schedule',
default='step',
type=str,
metavar='SCHEDULE',
choices=['step', 'linear', 'cosine'],
help='Type of LR schedule: {}, {}, {}'.format(
'step', 'linear', 'cosine'))
parser.add_argument('--warmup',
default=0,
type=int,
metavar='E',
help='number of warmup epochs')
parser.add_argument('--label-smoothing',
default=0.0,
type=float,
metavar='S',
help='label smoothing')
parser.add_argument('--mixup',
default=0.0,
type=float,
metavar='ALPHA',
help='mixup alpha')
parser.add_argument('--momentum',
default=0.9,
type=float,
metavar='M',
help='momentum')
parser.add_argument('--weight-decay',
'--wd',
default=1e-4,
type=float,
metavar='W',
help='weight decay (default: 1e-4)')
parser.add_argument(
'--bn-weight-decay',
action='store_true',
help=
'use weight_decay on batch normalization learnable parameters, (default: false)'
)
parser.add_argument('--nesterov',
action='store_true',
help='use nesterov momentum, (default: false)')
parser.add_argument('--print-freq',
'-p',
default=10,
type=int,
metavar='N',
help='print frequency (default: 10)')
parser.add_argument('--resume',
default='',
type=str,
metavar='PATH',
help='path to latest checkpoint (default: none)')
parser.add_argument('--pretrained-weights',
default='',
type=str,
metavar='PATH',
help='load weights from here')
parser.add_argument('--fp16',
action='store_true',
help='Run model fp16 mode.')
parser.add_argument(
'--static-loss-scale',
type=float,
default=1,
help=
'Static loss scale, positive power of 2 values can improve fp16 convergence.'
)
parser.add_argument(
'--dynamic-loss-scale',
action='store_true',
help='Use dynamic loss scaling. If supplied, this argument supersedes '
+ '--static-loss-scale.')
parser.add_argument('--prof',
type=int,
default=-1,
metavar='N',
help='Run only N iterations')
parser.add_argument('--amp',
action='store_true',
help='Run model AMP (automatic mixed precision) mode.')
parser.add_argument('--seed',
default=None,
type=int,
help='random seed used for numpy and pytorch')
parser.add_argument(
'--gather-checkpoints',
action='store_true',
help=
'Gather checkpoints throughout the training, without this flag only best and last checkpoints will be stored'
)
parser.add_argument('--raport-file',
default='experiment_raport.json',
type=str,
help='file in which to store JSON experiment raport')
parser.add_argument('--evaluate',
action='store_true',
help='evaluate checkpoint/model')
parser.add_argument('--training-only',
action='store_true',
help='do not evaluate')
parser.add_argument(
'--no-checkpoints',
action='store_false',
dest='save_checkpoints',
help='do not store any checkpoints, useful for benchmarking')
parser.add_argument(
'--workspace',
type=str,
default='./',
metavar='DIR',
help='path to directory where checkpoints will be stored')
def main(args):
exp_start_time = time.time()
global best_prec1
best_prec1 = 0
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
args.local_rank = int(os.environ['LOCAL_RANK'])
args.gpu = 0
args.world_size = 1
if args.distributed:
args.gpu = args.local_rank % torch.cuda.device_count()
torch.cuda.set_device(args.gpu)
dist.init_process_group(backend='nccl', init_method='env://')
args.world_size = torch.distributed.get_world_size()
if args.amp and args.fp16:
print("Please use only one of the --fp16/--amp flags")
exit(1)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
np.random.seed(seed=args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
def _worker_init_fn(id):
np.random.seed(seed=args.seed + args.local_rank + id)
random.seed(args.seed + args.local_rank + id)
else:
def _worker_init_fn(id):
pass
if args.fp16:
assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
if args.static_loss_scale != 1.0:
if not args.fp16:
print(
"Warning: if --fp16 is not used, static_loss_scale will be ignored."
)
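# Gradient accumulation: when --optimizer-batch-size exceeds world_size * batch_size,
# gradients are accumulated over optimizer_batch_size // (world_size * batch_size) steps.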
if args.optimizer_batch_size < 0:
batch_size_multiplier = 1
else:
tbs = args.world_size * args.batch_size
if args.optimizer_batch_size % tbs != 0:
print(
"Warning: simulated batch size {} is not divisible by actual batch size {}"
.format(args.optimizer_batch_size, tbs))
batch_size_multiplier = int(args.optimizer_batch_size / tbs)
print("BSM: {}".format(batch_size_multiplier))
pretrained_weights = None
if args.pretrained_weights:
if os.path.isfile(args.pretrained_weights):
print("=> loading pretrained weights from '{}'".format(
args.pretrained_weights))
pretrained_weights = torch.load(args.pretrained_weights)
else:
print("=> no pretrained weights found at '{}'".format(args.resume))
start_epoch = 0
# optionally resume from a checkpoint
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(
args.resume,
map_location=lambda storage, loc: storage.cuda(args.gpu))
start_epoch = checkpoint['epoch']
best_prec1 = checkpoint['best_prec1']
model_state = checkpoint['state_dict']
optimizer_state = checkpoint['optimizer']
print("=> loaded checkpoint '{}' (epoch {})".format(
args.resume, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(args.resume))
model_state = None
optimizer_state = None
else:
model_state = None
optimizer_state = None
loss = nn.CrossEntropyLoss
if args.mixup > 0.0:
loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
elif args.label_smoothing > 0.0:
loss = lambda: LabelSmoothing(args.label_smoothing)
model_and_loss = ModelAndLoss((args.arch, args.model_config),
loss,
pretrained_weights=pretrained_weights,
cuda=True,
fp16=args.fp16)
# Create data loaders and optimizers as needed
if args.data_backend == 'pytorch':
get_train_loader = get_pytorch_train_loader
get_val_loader = get_pytorch_val_loader
elif args.data_backend == 'dali-gpu':
get_train_loader = get_dali_train_loader(dali_cpu=False)
get_val_loader = get_dali_val_loader()
elif args.data_backend == 'dali-cpu':
get_train_loader = get_dali_train_loader(dali_cpu=True)
get_val_loader = get_dali_val_loader()
elif args.data_backend == 'syntetic':
get_val_loader = get_syntetic_loader
get_train_loader = get_syntetic_loader
train_loader, train_loader_len = get_train_loader(args.data,
args.batch_size,
1000,
args.mixup > 0.0,
workers=args.workers,
fp16=args.fp16)
if args.mixup != 0.0:
train_loader = MixUpWrapper(args.mixup, 1000, train_loader)
val_loader, val_loader_len = get_val_loader(args.data,
args.batch_size,
1000,
False,
workers=args.workers,
fp16=args.fp16)
if not torch.distributed.is_initialized() or torch.distributed.get_rank(
) == 0:
logger = log.Logger(args.print_freq, [
dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
step_format=log.format_step),
dllogger.JSONStreamBackend(
dllogger.Verbosity.VERBOSE,
os.path.join(args.workspace, args.raport_file))
])
else:
logger = log.Logger(args.print_freq, [])
logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
optimizer = get_optimizer(list(model_and_loss.model.named_parameters()),
args.fp16,
args.lr,
args.momentum,
args.weight_decay,
nesterov=args.nesterov,
bn_weight_decay=args.bn_weight_decay,
state=optimizer_state,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale)
if args.lr_schedule == 'step':
lr_policy = lr_step_policy(args.lr, [30, 60, 80],
0.1,
args.warmup,
logger=logger)
elif args.lr_schedule == 'cosine':
lr_policy = lr_cosine_policy(args.lr,
args.warmup,
args.epochs,
logger=logger)
elif args.lr_schedule == 'linear':
lr_policy = lr_linear_policy(args.lr,
args.warmup,
args.epochs,
logger=logger)
if args.amp:
model_and_loss, optimizer = amp.initialize(
model_and_loss,
optimizer,
opt_level="O2",
loss_scale="dynamic"
if args.dynamic_loss_scale else args.static_loss_scale)
if args.distributed:
model_and_loss.distributed()
model_and_loss.load_model_state(model_state)
train_loop(model_and_loss,
optimizer,
lr_policy,
train_loader,
val_loader,
args.epochs,
args.fp16,
logger,
should_backup_checkpoint(args),
use_amp=args.amp,
batch_size_multiplier=batch_size_multiplier,
start_epoch=start_epoch,
best_prec1=best_prec1,
prof=args.prof,
skip_training=args.evaluate,
skip_validation=args.training_only,
save_checkpoints=args.save_checkpoints and not args.evaluate,
checkpoint_dir=args.workspace)
exp_duration = time.time() - exp_start_time
if not torch.distributed.is_initialized() or torch.distributed.get_rank(
) == 0:
logger.end()
print("Experiment ended")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
main(args)

View file

@@ -1,3 +1,74 @@
# From PyTorch:
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import sys
import subprocess
import os
@@ -68,12 +139,12 @@ def main():
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
current_env["LOCAL_RANK"] = str(local_rank)
# spawn the processes
cmd = [sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)] + args.training_script_args
args.training_script] + args.training_script_args
print(cmd)
@@ -94,13 +165,13 @@ def main():
elif ret != 0:
error = True
time.sleep(1)
if error:
for p in processes:
if p.poll() is None:
p.terminate()
exit(1)
except KeyboardInterrupt:
for p in processes:
p.terminate()

View file

@@ -0,0 +1 @@
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

View file

@@ -0,0 +1,688 @@
# ResNet50 v1.5 For PyTorch
This repository provides a script and recipe to train the ResNet50 model to
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
  * [Model architecture](#model-architecture)
  * [Default configuration](#default-configuration)
    * [Optimizer](#optimizer)
    * [Data augmentation](#data-augmentation)
  * [DALI](#dali)
  * [Feature support matrix](#feature-support-matrix)
    * [Features](#features)
  * [Mixed precision training](#mixed-precision-training)
    * [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
  * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
  * [Scripts and sample code](#scripts-and-sample-code)
  * [Parameters](#parameters)
  * [Command-line options](#command-line-options)
  * [Getting the data](#getting-the-data)
    * [Dataset guidelines](#dataset-guidelines)
    * [Multi-dataset](#multi-dataset)
  * [Training process](#training-process)
  * [Inference process](#inference-process)
* [Performance](#performance)
  * [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
    * [Inference performance benchmark](#inference-performance-benchmark)
  * [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
      * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
      * [Example plots](#example-plots)
    * [Training performance results](#training-performance-results)
      * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
    * [Training time for 90 epochs](#training-time-for-90-epochs)
      * [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
    * [Inference performance results](#inference-performance-results)
      * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
      * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Release notes](#release-notes)
  * [Changelog](#changelog)
  * [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is that, in the bottleneck blocks that require
downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
The model is initialized as described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)
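To make the stride placement concrete, here is a minimal sketch (illustrative only; the channel sizes and layer names are not taken from the repository's classes) of the first two convolutions of a downsampling bottleneck in each variant:
```python
import torch.nn as nn

# ResNet v1: the downsampling stride sits in the first 1x1 convolution
v1_head = nn.Sequential(
    nn.Conv2d(256, 128, kernel_size=1, stride=2, bias=False),                # stride here
    nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False),
)

# ResNet v1.5: the stride moves to the 3x3 convolution (as in the Bottleneck
# class above, where builder.conv3x3 receives stride=stride)
v15_head = nn.Sequential(
    nn.Conv2d(256, 128, kernel_size=1, stride=1, bias=False),
    nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1, bias=False),     # stride here
)
```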
### Default configuration
The following sections highlight the default configurations for the ResNet50 model.
#### Optimizer
This model uses the SGD optimizer with momentum and the following hyperparameters (a minimal setup sketch follows this list):
* Momentum (0.875)
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes we linearly scale the learning rate.
* Learning rate schedule - we use a cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate during the first couple of epochs, according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677). Warmup length depends on the total training length.
* Weight decay (WD) = 3.0517578125e-05 (1/32768).
* We do not apply WD to Batch Norm trainable parameters (gamma/bias)
* Label smoothing = 0.1
* We train for:
  * 50 Epochs -> configuration that reaches 75.9% top1 accuracy
  * 90 Epochs -> 90 epochs is a standard for ImageNet networks
  * 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
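A minimal sketch of this recipe in plain PyTorch. It is illustrative only: the actual script builds the optimizer and schedule through `get_optimizer` and `lr_cosine_policy` shown earlier, `TinyNet` is a placeholder model, the warmup length of 5 epochs is an assumption, and the label-smoothing loss is omitted.
```python
import math
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.bn = nn.BatchNorm2d(8)
    def forward(self, x):
        return self.bn(self.conv(x))

model = TinyNet()

# no weight decay on BatchNorm parameters (matched here, as in the repo, by 'bn' in the name)
bn_params = [p for n, p in model.named_parameters() if 'bn' in n]
rest_params = [p for n, p in model.named_parameters() if 'bn' not in n]
optimizer = torch.optim.SGD(
    [{'params': bn_params, 'weight_decay': 0.0},
     {'params': rest_params, 'weight_decay': 3.0517578125e-05}],  # WD = 1/32768
    lr=0.256, momentum=0.875)                                     # LR for batch size 256

def cosine_lr(epoch, base_lr=0.256, epochs=90, warmup=5):
    """Linear warmup followed by cosine decay (mirrors lr_cosine_policy above)."""
    if epoch < warmup:
        return base_lr * (epoch + 1) / warmup
    e, es = epoch - warmup, epochs - warmup
    return 0.5 * (1 + math.cos(math.pi * e / es)) * base_lr

for epoch in range(90):
    for group in optimizer.param_groups:
        group['lr'] = cosine_lr(epoch)
    # ... run the training epoch ...
```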
#### Data augmentation
This model uses the following data augmentation (a torchvision-based sketch follows this list):
* For training:
  * Normalization
  * Random resized crop to 224x224
    * Scale from 8% to 100%
    * Aspect ratio from 3/4 to 4/3
  * Random horizontal flip
* For inference:
  * Normalization
  * Scale to 256x256
  * Center crop to 224x224
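A sketch of this augmentation pipeline with torchvision (illustrative; the repository's DALI and PyTorch data backends implement the equivalent logic, and the normalization constants below are the usual ImageNet values, assumed rather than taken from this repo):
```python
import torchvision.transforms as transforms

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# training: random resized crop (scale 8%-100%, aspect ratio 3/4-4/3) + horizontal flip
train_tf = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3/4, 4/3)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# inference: resize to 256, center crop to 224
val_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
```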
#### Other training recipes
This script does not target any specific benchmark.
There are changes that others have made which can speed up convergence and/or increase accuracy.
One of the more popular training recipes is provided by [fast.ai](https://github.com/fastai/imagenet-fast).
The fast.ai recipe introduces many changes to the training procedure, one of which is progressive resizing of the training images.
The first part of training uses 128px images, the middle part uses 224px images, and the last part uses 288px images.
The final validation is performed on 288px images.
The training script in this repository performs validation on 224px images, just as the original paper describes.
These two approaches can't be directly compared, since the fast.ai recipe requires validation on 288px images,
and this recipe keeps the original assumption that validation is done on 224px images.
Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
### Feature support matrix
The following features are supported by this model:
| Feature | ResNet50
|-----------------------|--------------------------
|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes
|[APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
#### Features
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
### DALI
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when the CPU becomes a bottleneck.
DALI can use the CPU or the GPU, and outperforms the PyTorch native dataloader.
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
For DGX-1 we recommend `--data-backend dali-cpu`; for DGX-2 we recommend `--data-backend dali-gpu`.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using the scale_loss() method provided by AMP. The scaling value can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walkthrough of AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
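For reference, `main.py` above uses the newer `amp.initialize` entry point rather than `amp.init`. A minimal sketch of that call (placeholder model and optimizer; requires a CUDA device and Apex installed; `opt_level="O2"` and the loss-scale choice mirror the call in `main.py`):
```python
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()                 # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# cast the model and wrap the optimizer for mixed precision
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level="O2", loss_scale="dynamic")

loss = model(torch.randn(4, 10).cuda()).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:   # loss scaling before backward
    scaled_loss.backward()
optimizer.step()
```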
## Setup
The following section lists the requirements that you need to meet in order to start training the ResNet50 model.
### Requirements
This repository contains a Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Classification/
```
### 2. Download and preprocess the dataset.
The ResNet50 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs; therefore, no offline preprocessing/augmentation is needed.
1. [Download the images](http://image-net.org/download-images).
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
### 3. Build the RN50v1.5 PyTorch NGC container.
```
docker build . -t nvidia_rn50
```
### 4. Start an interactive session in the NGC container to run training/inference.
```
nvidia-docker run --rm -it -v <path to imagenet>:/data/imagenet --ipc=host nvidia_rn50
```
### 5. Start training
To run training for a standard configuration (DGX1V/DGX2V, FP16/FP32, 50/90/250 Epochs),
run one of the scripts in the `./resnet50v1.5/training` directory
called `./resnet50v1.5/training/{DGX1, DGX2}_RN50_{AMP, FP16, FP32}_{50,90,250}E.sh`.
Ensure ImageNet is mounted in the `/data/imagenet` directory.
Example:
`bash ./resnet50v1.5/training/DGX1_RN50_FP16_250E.sh <path where to store checkpoints and logs>`
### 6. Start inference
To run inference on ImageNet on a checkpointed model, run:
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
To run a non-standard configuration use:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py -h`
```
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
[--model-config CONF] [-j N] [--epochs N] [-b N]
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
[--momentum M] [--weight-decay W] [--bn-weight-decay]
[--nesterov] [--print-freq N] [--resume PATH]
[--pretrained-weights PATH] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
[--training-only] [--no-checkpoints] [--workspace DIR]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--data-backend BACKEND
data backend: pytorch | syntetic | dali-gpu | dali-cpu
(default: dali-cpu)
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
resnet101 | resnet152 | resnext101-32x4d |
se-resnext101-32x4d (default: resnet50)
--model-config CONF, -c CONF
model configs: classic | fanin | grp-fanin | grp-
fanout (default: classic)
-j N, --workers N number of data loading workers (default: 5)
--epochs N number of total epochs to run
-b N, --batch-size N mini-batch size (default: 256) per gpu
--optimizer-batch-size N
size of a total batch size, for simulating bigger
batches using gradient accumulation
--lr LR, --learning-rate LR
initial learning rate
--lr-schedule SCHEDULE
Type of LR schedule: step, linear, cosine
--warmup E number of warmup epochs
--label-smoothing S label smoothing
--mixup ALPHA mixup alpha
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--bn-weight-decay use weight_decay on batch normalization learnable
parameters, (default: false)
--nesterov use nesterov momentum, (default: false)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
--pretrained-weights PATH
load weights from here
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can
improve fp16 convergence.
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
supersedes --static-loss-scale.
--prof N Run only N iterations
--amp Run model AMP (automatic mixed precision) mode.
--local_rank LOCAL_RANK
Local rank of python process. Set up by distributed
launcher
--seed SEED random seed used for numpy and pytorch
--gather-checkpoints Gather checkpoints throughout the training, without
this flag only best and last checkpoints will be
stored
--raport-file RAPORT_FILE
file in which to store JSON experiment raport
--evaluate evaluate checkpoint/model
--training-only do not evaluate
--no-checkpoints do not store any checkpoints, useful for benchmarking
--workspace DIR path to directory where checkpoints will be stored
```
### Dataset guidelines
To use your own dataset, divide it in directories as in the following scheme:
- Training images - `train/<class id>/<image>`
- Validation images - `val/<class id>/<image>`
If your dataset has a number of classes different from 1000, you need to add a custom config
in the `image_classification/resnet.py` file.
```python
resnet_versions = {
...
'resnet50-custom' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 4, 6, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : <custom number of classes>,
}
}
```
After adding the config, run the training script with `--arch resnet50-custom` flag.
### Training process
All results of the training will be stored in the directory specified with the `--workspace` argument.
The script will store:
- most recent checkpoint - `checkpoint.pth.tar` (unless `--no-checkpoints` flag is used).
- checkpoint with best validation accuracy - `model_best.pth.tar` (unless `--no-checkpoints` flag is used).
- JSON log - in the file specified with `--raport-file` flag.
Metrics gathered during training (a sketch for parsing the JSON raport follows this list):
- `train.loss` - training loss
- `train.total_ips` - training speed measured in images/second
- `train.compute_ips` - training speed measured in images/second, not counting data loading
- `train.data_time` - time spent on waiting on data
- `train.compute_time` - time spent in forward/backward pass
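The JSON raport can be inspected with a few lines of Python. The sketch below assumes dllogger's `JSONStreamBackend` line format (a `DLLL ` prefix followed by a JSON object), which may differ between dllogger versions:
```python
import json

with open("experiment_raport.json") as f:
    for line in f:
        if not line.startswith("DLLL "):
            continue                       # skip anything that is not a dllogger record
        entry = json.loads(line[len("DLLL "):])
        data = entry.get("data", {})
        if "train.total_ips" in data:
            print(entry.get("step"), data["train.total_ips"], "img/s")
```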
### Inference process
Validation is done every epoch, and can also be run separately on a checkpointed model.
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
Metrics gathered during validation:
- `val.loss` - validation loss
- `val.top1` - validation top1 accuracy
- `val.top5` - validation top5 accuracy
- `val.total_ips` - inference speed measured in images/second
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
- `val.data_time` - time spent on waiting on data
- `val.compute_time` - time spent on inference
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
Example output:
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
#### Inference performance benchmark
To benchmark inference, run:
* FP32
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Results
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
| 90 | 77.23 +/- 0.04 | 77.08 +/- 0.08 |
| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 75.81 +/- 0.08 | 76.04 +/- 0.05 |
| 90 | 77.10 +/- 0.06 | 77.23 +/- 0.04 |
| 250 | 78.59 +/- 0.13 | 78.46 +/- 0.03 |
##### Example plots
The following images show a 250-epoch configuration on a DGX-1V.
![ValidationLoss](./img/loss_plot.png)
![ValidationTop1](./img/top1_plot.png)
![ValidationTop5](./img/top5_plot.png)
#### Training performance results
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 893.09 img/s | 380.44 img/s | 2.35x | 1.00x | 1.00x |
| 8 | 6888.75 img/s | 2945.37 img/s | 2.34x | 7.71x | 7.74x |
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 849.63 img/s | 373.93 img/s | 2.27x | 1.00x | 1.00x |
| 8 | 6614.15 img/s | 2911.22 img/s | 2.27x | 7.78x | 7.79x |
##### Training performance: NVIDIA DGX2 (16x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 894.41 img/s | 402.23 img/s | 2.22x | 1.00x | 1.00x |
| 16 | 13443.82 img/s | 6263.41 img/s | 2.15x | 15.03x | 15.57x |
#### Training Time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 95 h |
| 8 | ~ 7 h | ~ 14 h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 90 h |
| 16 | ~ 5 h | ~ 8 h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 136.82 img/s | 7.12ms | 7.25ms | 8.36ms | 10.92ms |
| 2 | 266.86 img/s | 7.27ms | 7.41ms | 7.85ms | 9.11ms |
| 4 | 521.76 img/s | 7.44ms | 7.58ms | 8.14ms | 10.09ms |
| 8 | 766.22 img/s | 10.18ms | 10.46ms | 10.97ms | 12.75ms |
| 16 | 976.36 img/s | 15.79ms | 15.88ms | 15.95ms | 16.63ms |
| 32 | 1092.27 img/s | 28.63ms | 28.71ms | 28.76ms | 29.30ms |
| 64 | 1161.55 img/s | 53.69ms | 53.86ms | 53.90ms | 54.23ms |
| 128 | 1209.12 img/s | 104.24ms | 104.68ms | 104.80ms | 105.00ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 114.97 img/s | 8.56ms | 9.32ms | 11.43ms | 12.79ms |
| 2 | 238.70 img/s | 8.20ms | 8.75ms | 9.49ms | 12.31ms |
| 4 | 448.69 img/s | 8.67ms | 9.20ms | 9.97ms | 10.60ms |
| 8 | 875.00 img/s | 8.88ms | 9.31ms | 9.80ms | 10.82ms |
| 16 | 1746.07 img/s | 8.89ms | 9.05ms | 9.56ms | 12.81ms |
| 32 | 2004.28 img/s | 14.07ms | 14.14ms | 14.31ms | 14.92ms |
| 64 | 2254.60 img/s | 25.93ms | 26.05ms | 26.07ms | 26.17ms |
| 128 | 2360.14 img/s | 50.14ms | 50.28ms | 50.34ms | 50.68ms |
| 256 | 2342.13 img/s | 96.74ms | 96.91ms | 96.99ms | 97.14ms |
##### Inference performance: NVIDIA T4
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 179.85 img/s | 5.51ms | 5.65ms | 7.34ms | 10.97ms |
| 2 | 348.12 img/s | 5.67ms | 5.95ms | 6.33ms | 9.81ms |
| 4 | 556.27 img/s | 7.03ms | 7.34ms | 8.13ms | 9.65ms |
| 8 | 740.43 img/s | 10.32ms | 10.33ms | 10.60ms | 13.87ms |
| 16 | 909.17 img/s | 17.19ms | 17.15ms | 18.13ms | 21.06ms |
| 32 | 999.07 img/s | 31.07ms | 31.12ms | 31.17ms | 32.41ms |
| 64 | 1090.47 img/s | 57.62ms | 57.84ms | 57.91ms | 58.05ms |
| 128 | 1142.46 img/s | 110.94ms | 111.15ms | 111.23ms | 112.16ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 163.78 img/s | 6.05ms | 5.92ms | 7.98ms | 11.58ms |
| 2 | 333.43 img/s | 5.91ms | 6.05ms | 6.63ms | 11.52ms |
| 4 | 645.45 img/s | 6.04ms | 6.33ms | 7.01ms | 8.90ms |
| 8 | 1164.15 img/s | 6.73ms | 7.31ms | 8.04ms | 12.41ms |
| 16 | 1606.42 img/s | 9.53ms | 9.86ms | 10.52ms | 17.01ms |
| 32 | 1857.29 img/s | 15.67ms | 15.61ms | 16.14ms | 18.66ms |
| 64 | 2011.62 img/s | 28.64ms | 28.69ms | 28.82ms | 31.06ms |
| 128 | 2083.90 img/s | 54.87ms | 54.96ms | 54.99ms | 55.27ms |
| 256 | 2043.72 img/s | 106.51ms | 106.62ms | 106.68ms | 107.03ms |
## Release notes
### Changelog
1. September 2018
* Initial release
2. January 2019
* Added options: label smoothing, fan-in initialization, and skipping weight decay on batch norm gamma and bias.
3. May 2019
* Cosine LR schedule
* MixUp regularization
* DALI support
* DGX2 configurations
* gradient accumulation
4. July 2019
* DALI-CPU dataloader
* Updated README
### Known issues
There are no known issues with this model.

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

View file

@ -0,0 +1,650 @@
# ResNeXt101-32x4d For PyTorch
This repository provides a script and recipe to train the ResNeXt101-32x4d model to
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [DALI](#dali)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Example plots](#example-plots)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training time for 90 epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNeXt101-32x4d is a model introduced in the [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf) paper.
It is based on the regular ResNet model, substituting the 3x3 convolutions inside the bottleneck block with 3x3 grouped convolutions.
### Model architecture
![ResNextArch](./img/ResNeXtArch.png)
_Image source: [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)_
The image shows the difference between the ResNet bottleneck block and the ResNeXt bottleneck block.
The ResNeXt101-32x4d model's cardinality equals 32 and its bottleneck width equals 4.
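To make the difference concrete, here is a hedged PyTorch sketch of a ResNeXt-style bottleneck: it is an ordinary ResNet bottleneck except that the 3x3 convolution is grouped with `groups=32` (the cardinality). With a bottleneck width of 4, the 3x3 stage of the first stage has 32 x 4 = 128 channels, and with expansion 2 the block outputs 256 channels (matching the widths/expansion used in the custom config later in this README). The identity shortcut, striding, and final ReLU are omitted for brevity.
```python
import torch
from torch import nn

in_channels, width, cardinality, out_channels = 256, 128, 32, 256

block = nn.Sequential(
    nn.Conv2d(in_channels, width, kernel_size=1, bias=False),   # reduce
    nn.BatchNorm2d(width),
    nn.ReLU(inplace=True),
    nn.Conv2d(width, width, kernel_size=3, padding=1,
              groups=cardinality, bias=False),                  # grouped 3x3: the ResNeXt change
    nn.BatchNorm2d(width),
    nn.ReLU(inplace=True),
    nn.Conv2d(width, out_channels, kernel_size=1, bias=False),  # expand
    nn.BatchNorm2d(out_channels),
)

x = torch.randn(2, in_channels, 56, 56)
print(block(x).shape)  # torch.Size([2, 256, 56, 56])
```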
### Default configuration
The following sections highlight the default configurations for the ResNeXt101-32x4d model.
#### Optimizer
This model uses the SGD optimizer with momentum and the following hyperparameters:
* Momentum (0.875)
* Learning rate (LR) = 0.256 for a batch size of 256. For other batch sizes we linearly
scale the learning rate (a worked sketch of the full schedule follows this list).
* Learning rate schedule - we use cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first couple of epochs
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
Warmup length depends on the total training length.
* Weight decay (WD)= 6.103515625e-05 (1/16384).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label smoothing = 0.1
* We train for:
* 90 Epochs -> 90 epochs is a standard for ImageNet networks
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
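The learning-rate rules above combine into a single schedule: scale the base LR of 0.256 linearly with the total batch size, optionally warm it up linearly for a few epochs, then decay it with a cosine. A small illustrative sketch follows; the function name and the exact warmup/cosine bookkeeping are assumptions, not the repository's implementation.
```python
import math

def lr_at_epoch(epoch, total_epochs, total_batch_size, base_lr=0.256, warmup_epochs=0):
    """Cosine schedule with linear warmup, following the rules described above."""
    peak_lr = base_lr * total_batch_size / 256                    # linear scaling rule
    if epoch < warmup_epochs:
        return peak_lr * (epoch + 1) / warmup_epochs              # linear warmup
    progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))   # cosine decay towards zero

# Example: the provided 8-GPU recipes use a total batch of 1024 (peak LR 1.024) with 8 warmup epochs.
for epoch in (0, 4, 8, 45, 89):
    print(epoch, round(lr_at_epoch(epoch, total_epochs=90, total_batch_size=1024, warmup_epochs=8), 4))
```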
#### Data augmentation
This model uses the following data augmentation:
* For training:
* Normalization
* Random resized crop to 224x224
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
* For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
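With the `pytorch` data backend, these operations map closely onto standard torchvision transforms. A hedged sketch is shown below; the DALI pipelines express the same operations with DALI operators, and the normalization statistics are the usual ImageNet values, assumed here.
```python
from torchvision import transforms

# Usual ImageNet channel statistics (assumed for normalization).
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training: random resized crop to 224x224 (scale 8%-100%, aspect ratio 3/4-4/3),
# random horizontal flip, then normalization.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Inference: scale to 256, center crop to 224, then normalization.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
```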
### Feature support matrix
The following features are supported by this model:
| Feature | ResNeXt101-32x4d |
|-----------------------|--------------------------|
| [DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes |
| [APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
#### Features
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
### DALI
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when CPU becomes a bottleneck.
DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
For ResNeXt101-32x4d we recommend `--data-backend dali-cpu` on both DGX-1 and DGX-2.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk through on AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
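Put together, these steps wrap an otherwise ordinary training iteration. A minimal sketch using the same AMP handle calls as above; the toy model, data, and hyperparameters are placeholders, and a CUDA device plus an Apex installation are assumed.
```python
import torch
from apex import amp

# Placeholder model/optimizer/data; any PyTorch classification setup works the same way.
model = torch.nn.Linear(2048, 1000).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.256, momentum=0.875)
criterion = torch.nn.CrossEntropyLoss().cuda()

# Steps from above: initialize an AMP handle and wrap the optimizer with it.
amp_handle = amp.init(enabled=True, verbose=True)
optimizer = amp_handle.wrap_optimizer(optimizer)

inputs = torch.randn(32, 2048).cuda()
targets = torch.randint(0, 1000, (32,)).cuda()

optimizer.zero_grad()
losses = criterion(model(inputs), targets)
# Scale the loss before backpropagation so small gradient values survive FP16.
with optimizer.scale_loss(losses) as scaled_losses:
    scaled_losses.backward()
optimizer.step()
```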
## Setup
The following section lists the requirements that you need to meet in order to start training the ResNeXt101-32x4d model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Classification/
```
### 2. Download and preprocess the dataset.
The ResNeXt101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs, therefore, preprocessing/augmentation is not needed.
1. [Download the images](http://image-net.org/download-images).
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
### 3. Build the RNXT101-32x4d PyTorch NGC container.
```
docker build . -t nvidia_rnxt101-32x4d
```
### 4. Start an interactive session in the NGC container to run training/inference.
```
nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_rnxt101-32x4d
```
### 5. Start training
To run training for a standard configuration (DGX1V, AMP/FP32, 90/250 Epochs),
run one of the scripts in the `./resnext101-32x4d/training` directory
called `./resnext101-32x4d/training/{AMP,FP32}/{DGX1}_RNXT101-32x4d_{AMP, FP32}_{90,250}E.sh`.
Ensure ImageNet is mounted in the `/imagenet` directory.
Example:
`bash ./resnext101-32x4d/training/DGX1_RNXT101-32x4d_FP16_250E.sh <path where to store checkpoints and logs>`
### 6. Start inference
To run inference on ImageNet on a checkpointed model, run:
`python ./main.py --arch resnext101-32x4d --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnext101-32x4d -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
To run a non-standard configuration, use:
* For 1 GPU
* FP32
`python ./main.py --arch resnext101-32x4d -c fanin --label-smoothing 0.1 <path to imagenet>`
    * AMP
`python ./main.py --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py -h`
```
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
[--model-config CONF] [-j N] [--epochs N] [-b N]
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
[--momentum M] [--weight-decay W] [--bn-weight-decay]
[--nesterov] [--print-freq N] [--resume PATH]
[--pretrained-weights PATH] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
[--training-only] [--no-checkpoints] [--workspace DIR]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--data-backend BACKEND
data backend: pytorch | syntetic | dali-gpu | dali-cpu
(default: dali-cpu)
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
resnet101 | resnet152 | resnext101-32x4d | se-
resnext101-32x4d (default: resnet50)
--model-config CONF, -c CONF
model configs: classic | fanin | grp-fanin | grp-
fanout(default: classic)
-j N, --workers N number of data loading workers (default: 5)
--epochs N number of total epochs to run
-b N, --batch-size N mini-batch size (default: 256) per gpu
--optimizer-batch-size N
size of a total batch size, for simulating bigger
batches using gradient accumulation
--lr LR, --learning-rate LR
initial learning rate
--lr-schedule SCHEDULE
Type of LR schedule: step, linear, cosine
--warmup E number of warmup epochs
--label-smoothing S label smoothing
--mixup ALPHA mixup alpha
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--bn-weight-decay use weight_decay on batch normalization learnable
parameters, (default: false)
--nesterov use nesterov momentum, (default: false)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
--pretrained-weights PATH
load weights from here
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can
improve fp16 convergence.
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
supersedes --static-loss-scale.
--prof N Run only N iterations
--amp Run model AMP (automatic mixed precision) mode.
--local_rank LOCAL_RANK
Local rank of python process. Set up by distributed
launcher
--seed SEED random seed used for numpy and pytorch
--gather-checkpoints Gather checkpoints throughout the training, without
this flag only best and last checkpoints will be
stored
--raport-file RAPORT_FILE
file in which to store JSON experiment raport
--evaluate evaluate checkpoint/model
--training-only do not evaluate
--no-checkpoints do not store any checkpoints, useful for benchmarking
--workspace DIR path to directory where checkpoints will be stored
```
### Dataset guidelines
To use your own dataset, divide it into directories following this scheme:
- Training images - `train/<class id>/<image>`
- Validation images - `val/<class id>/<image>`
If your dataset has a number of classes different from 1000, you need to add a custom config
in the `image_classification/resnet.py` file.
```python
resnet_versions = {
...
'resnext101-32x4d-custom' : {
'net' : ResNet,
'block' : Bottleneck,
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansion' : 2,
'num_classes' : <custom number of classes>,
}
}
```
After adding the config, run the training script with the `--arch resnext101-32x4d-custom` flag.
### Training process
All the results of the training will be stored in the directory specified with the `--workspace` argument.
The script will store:
- most recent checkpoint - `checkpoint.pth.tar` (unless `--no-checkpoints` flag is used).
- checkpoint with best validation accuracy - `model_best.pth.tar` (unless `--no-checkpoints` flag is used).
- JSON log - in the file specified with `--raport-file` flag.
Metrics gathered during training:
- `train.loss` - training loss
- `train.total_ips` - training speed measured in images/second
- `train.compute_ips` - training speed measured in images/second, not counting data loading
- `train.data_time` - time spent on waiting on data
- `train.compute_time` - time spent in forward/backward pass
### Inference process
Validation is done every epoch, and can also be run separately on a checkpointed model.
`python ./main.py --arch resnext101-32x4d --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
Metrics gathered during evaluation:
- `val.loss` - validation loss
- `val.top1` - validation top1 accuracy
- `val.top5` - validation top5 accuracy
- `val.total_ips` - inference speed measured in images/second
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
- `val.data_time` - time spent on waiting on data
- `val.compute_time` - time spent on inference
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnext101-32x4d -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
Example output:
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* AMP
`python ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
#### Inference performance benchmark
To benchmark inference, run:
* FP32
`python ./main.py --arch resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
* AMP
`python ./main.py --arch resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Results
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 90 | 79.23 +/- 0.09 | 79.23 +/- 0.09 |
| 250 | 79.92 +/- 0.13 | 80.06 +/- 0.06 |
##### Example plots
The following images show a 250-epoch configuration on a DGX-1V.
![ValidationLoss](./img/loss_plot.png)
![ValidationTop1](./img/top1_plot.png)
![ValidationTop5](./img/top5_plot.png)
#### Training performance results
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 313.43 img/s | 146.66 img/s | 2.14x | 1.00x | 1.00x |
| 8 | 2384.85 img/s | 1116.58 img/s | 2.14x | 7.61x | 7.61x |
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 297.83 img/s | 143.27 img/s | 2.08x | 1.00x | 1.00x |
| 8 | 2270.85 img/s | 1104.62 img/s | 2.06x | 7.62x | 7.71x |
##### Training performance: NVIDIA DGX2 (16x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 308.42 img/s | 151.67 img/s | 2.03x | 1.00x | 1.00x |
| 16 | 4473.37 img/s | 2261.97 img/s | 1.98x | 14.50x | 14.91x |
#### Training Time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 114 h | ~ 242 h |
| 8 | ~ 17 h | ~ 34 h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 116 h | ~ 234 h |
| 16 | ~ 10 h | ~ 18 h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 47.34 img/s | 21.02ms | 23.41ms | 24.55ms | 26.00ms |
| 2 | 89.68 img/s | 22.14ms | 22.90ms | 24.86ms | 26.59ms |
| 4 | 175.92 img/s | 22.57ms | 24.96ms | 25.53ms | 26.03ms |
| 8 | 325.69 img/s | 24.35ms | 25.17ms | 25.80ms | 28.52ms |
| 16 | 397.04 img/s | 40.04ms | 40.01ms | 40.08ms | 40.32ms |
| 32 | 431.77 img/s | 73.71ms | 74.05ms | 74.09ms | 74.26ms |
| 64 | 485.70 img/s | 131.04ms | 131.38ms | 131.53ms | 131.81ms |
| 128 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 43.11 img/s | 23.05ms | 25.19ms | 25.41ms | 26.63ms |
| 2 | 83.29 img/s | 23.82ms | 25.11ms | 26.25ms | 27.29ms |
| 4 | 173.67 img/s | 22.82ms | 24.38ms | 25.26ms | 25.92ms |
| 8 | 330.18 img/s | 24.05ms | 26.45ms | 27.37ms | 27.74ms |
| 16 | 634.82 img/s | 25.00ms | 26.93ms | 28.12ms | 28.73ms |
| 32 | 884.91 img/s | 35.71ms | 35.96ms | 36.01ms | 36.13ms |
| 64 | 998.40 img/s | 63.43ms | 63.63ms | 63.75ms | 63.96ms |
| 128 | 1079.10 img/s | 117.74ms | 118.02ms | 118.11ms | 118.35ms |
##### Inference performance: NVIDIA T4
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 55.64 img/s | 17.88ms | 19.21ms | 20.35ms | 22.29ms |
| 2 | 109.22 img/s | 18.24ms | 19.00ms | 20.43ms | 22.51ms |
| 4 | 217.27 img/s | 18.26ms | 18.88ms | 19.51ms | 21.74ms |
| 8 | 294.55 img/s | 26.74ms | 27.35ms | 27.62ms | 28.93ms |
| 16 | 351.30 img/s | 45.34ms | 45.72ms | 46.10ms | 47.43ms |
| 32 | 401.97 img/s | 79.10ms | 79.37ms | 79.44ms | 81.83ms |
| 64 | 449.30 img/s | 140.30ms | 140.73ms | 141.26ms | 143.57ms |
| 128 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 51.14 img/s | 19.48ms | 20.16ms | 21.40ms | 26.21ms |
| 2 | 102.29 img/s | 19.44ms | 19.77ms | 20.42ms | 24.51ms |
| 4 | 209.44 img/s | 18.93ms | 19.52ms | 20.23ms | 21.95ms |
| 8 | 408.69 img/s | 19.47ms | 21.12ms | 23.15ms | 25.77ms |
| 16 | 641.78 img/s | 24.54ms | 25.19ms | 25.64ms | 27.31ms |
| 32 | 800.26 img/s | 39.28ms | 39.43ms | 39.54ms | 41.96ms |
| 64 | 883.66 img/s | 71.76ms | 71.87ms | 71.94ms | 72.78ms |
| 128 | 948.27 img/s | 134.19ms | 134.40ms | 134.58ms | 134.81ms |
## Release notes
### Changelog
1. October 2019
* Initial release
### Known issues
There are no known issues with this model.

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 90 --warmup 8 --wd 6.103515625e-05

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

View file

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 90 --warmup 8 --wd 6.103515625e-05

View file

@ -0,0 +1,651 @@
# SE-ResNeXt101-32x4d For PyTorch
This repository provides a script and recipe to train the SE-ResNeXt101-32x4d model to
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [DALI](#dali)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Example plots](#example-plots)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training time for 90 epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The SE-ResNeXt101-32x4d is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)
model with an added Squeeze-and-Excitation module, introduced
in the [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf) paper.
The architecture of the Squeeze-and-Excitation module for ResNet-type models is shown below.
### Model architecture
![SEArch](./img/SEArch.png)
_Image source: [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)_
The image shows the architecture of the SE block and where it is placed in the ResNet bottleneck block.
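As a concrete illustration of what the module does (squeeze spatial information into a per-channel descriptor, pass it through a small bottleneck MLP, and use a sigmoid gate to rescale the channels of the residual branch), here is a hedged PyTorch sketch; the reduction ratio of 16 follows the paper, not necessarily this repository's exact implementation.
```python
import torch
from torch import nn

class SqueezeExcite(nn.Module):
    """Channel attention as in Squeeze-and-Excitation Networks (illustrative sketch)."""

    def __init__(self, channels, reduction=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)      # squeeze: global spatial average per channel
        self.fc = nn.Sequential(                 # excitation: bottleneck MLP + sigmoid gate
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x):
        n, c, _, _ = x.shape
        scale = self.fc(self.pool(x).view(n, c)).view(n, c, 1, 1)
        return x * scale                          # rescale the channels of the block output

# In an SE-ResNeXt bottleneck, the module is applied to the residual branch
# before it is added back to the identity path.
features = torch.randn(2, 256, 56, 56)
print(SqueezeExcite(256)(features).shape)
```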
### Default configuration
The following sections highlight the default configurations for the SE-ResNeXt101-32x4d model.
#### Optimizer
This model uses the SGD optimizer with momentum and the following hyperparameters:
* Momentum (0.875)
* Learning rate (LR) = 0.256 for a batch size of 256. For other batch sizes we linearly
scale the learning rate.
* Learning rate schedule - we use cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first couple of epochs
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
Warmup length depends on the total training length.
* Weight decay (WD)= 6.103515625e-05 (1/16384).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label smoothing = 0.1
* We train for:
* 90 Epochs -> 90 epochs is a standard for ImageNet networks
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
#### Data augmentation
This model uses the following data augmentation:
* For training:
* Normalization
* Random resized crop to 224x224
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
* For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
### Feature support matrix
The following features are supported by this model:
| Feature | SE-ResNeXt101-32x4d |
|-----------------------|--------------------------|
| [DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes |
| [APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
#### Features
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
### DALI
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when CPU becomes a bottleneck.
DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
For SE-ResNeXt101-32x4d we recommend `--data-backend dali-cpu` on both DGX-1 and DGX-2.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk through on AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
## Setup
The following section lists the requirements that you need to meet in order to start training the SE-ResNeXt101-32x4d model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Classification/
```
### 2. Download and preprocess the dataset.
The SE-ResNeXt101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs, therefore, preprocessing/augmentation is not needed.
1. [Download the images](http://image-net.org/download-images).
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
### 3. Build the SE-RNXT101-32x4d PyTorch NGC container.
```
docker build . -t nvidia_se-rnxt101-32x4d
```
### 4. Start an interactive session in the NGC container to run training/inference.
```
nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_se-rnxt101-32x4d
```
### 5. Start training
To run training for a standard configuration (DGX1V/DGX2V, AMP/FP32, 90/250 Epochs),
run one of the scripts in the `./se-resnext101-32x4d/training` directory
called `./se-resnext101-32x4d/training/{DGX1, DGX2}_SE-RNXT101-32x4d_{AMP, FP32}_{90,250}E.sh`.
Ensure ImageNet is mounted in the `/imagenet` directory.
Example:
`bash ./se-resnext101-32x4d/training/DGX1_SE-RNXT101-32x4d_FP16_250E.sh`
### 6. Start inference
To run inference on ImageNet on a checkpointed model, run:
`python ./main.py --arch se-resnext101-32x4d --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch se-resnext101-32x4d -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
To run a non-standard configuration, use:
* For 1 GPU
* FP32
`python ./main.py --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 <path to imagenet>`
    * AMP
`python ./main.py --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py -h`
```
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
[--model-config CONF] [-j N] [--epochs N] [-b N]
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
[--momentum M] [--weight-decay W] [--bn-weight-decay]
[--nesterov] [--print-freq N] [--resume PATH]
[--pretrained-weights PATH] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
[--training-only] [--no-checkpoints] [--workspace DIR]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--data-backend BACKEND
data backend: pytorch | syntetic | dali-gpu | dali-cpu
(default: dali-cpu)
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
resnet101 | resnet152 | resnext101-32x4d | se-
resnext101-32x4d (default: resnet50)
--model-config CONF, -c CONF
model configs: classic | fanin | grp-fanin | grp-
fanout(default: classic)
-j N, --workers N number of data loading workers (default: 5)
--epochs N number of total epochs to run
-b N, --batch-size N mini-batch size (default: 256) per gpu
--optimizer-batch-size N
size of a total batch size, for simulating bigger
batches using gradient accumulation
--lr LR, --learning-rate LR
initial learning rate
--lr-schedule SCHEDULE
Type of LR schedule: step, linear, cosine
--warmup E number of warmup epochs
--label-smoothing S label smoothing
--mixup ALPHA mixup alpha
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--bn-weight-decay use weight_decay on batch normalization learnable
parameters, (default: false)
--nesterov use nesterov momentum, (default: false)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
--pretrained-weights PATH
load weights from here
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can
improve fp16 convergence.
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
supersedes --static-loss-scale.
--prof N Run only N iterations
--amp Run model AMP (automatic mixed precision) mode.
--local_rank LOCAL_RANK
Local rank of python process. Set up by distributed
launcher
--seed SEED random seed used for numpy and pytorch
--gather-checkpoints Gather checkpoints throughout the training, without
this flag only best and last checkpoints will be
stored
--raport-file RAPORT_FILE
file in which to store JSON experiment raport
--evaluate evaluate checkpoint/model
--training-only do not evaluate
--no-checkpoints do not store any checkpoints, useful for benchmarking
--workspace DIR path to directory where checkpoints will be stored
```
### Dataset guidelines
To use your own dataset, divide it into directories using the following scheme (a quick layout check is sketched after the list):
- Training images - `train/<class id>/<image>`
- Validation images - `val/<class id>/<image>`
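Before training on a custom dataset, a quick layout check can save time. The following standalone sketch (assumed to be run from the dataset root) counts class directories and images in `train/` and `val/`:
```python
# Sanity-check an ImageFolder-style layout (one subdirectory per class id,
# images inside). Run from the dataset root; 'train' and 'val' are the
# directories described above.
import os

def summarize(split_dir):
    classes = [d for d in sorted(os.listdir(split_dir))
               if os.path.isdir(os.path.join(split_dir, d))]
    n_images = sum(len(os.listdir(os.path.join(split_dir, c))) for c in classes)
    print(f"{split_dir}: {len(classes)} classes, {n_images} images")
    return len(classes)

n_train = summarize("train")
n_val = summarize("val")
assert n_train == n_val, "train/ and val/ should contain the same class ids"
```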
If your dataset has a number of classes different from 1000, you need to add a custom config
in the `image_classification/resnet.py` file.
```python
resnet_versions = {
...
'se-resnext101-32x4d-custom' : {
'net' : ResNet,
'block' : SEBottleneck,
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansion' : 2,
'num_classes' : <custom number of classes>,
}
}
```
After adding the config, run the training script with the `--arch se-resnext101-32x4d-custom` flag.
### Training process
All training results will be stored in the directory specified with the `--workspace` argument.
The script will store:
- the most recent checkpoint - `checkpoint.pth.tar` (unless the `--no-checkpoints` flag is used).
- the checkpoint with the best validation accuracy - `model_best.pth.tar` (unless the `--no-checkpoints` flag is used).
- a JSON log in the file specified with the `--raport-file` flag (a sketch for inspecting it follows the metric list below).
Metrics gathered through training:
- `train.loss` - training loss
- `train.total_ips` - training speed measured in images/second
- `train.compute_ips` - training speed measured in images/second, not counting data loading
- `train.data_time` - time spent waiting for data
- `train.compute_time` - time spent in forward/backward pass
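The exact schema of the JSON report is defined by the training script, so a safe way to inspect a run is to load the file and list the key paths it actually contains. The sketch below makes no assumptions beyond the file being valid JSON; `raport.json` is a placeholder name:
```python
# Inspect the experiment report written with --raport-file without assuming a
# particular schema: load the JSON and print the nested key paths it contains.
# 'raport.json' is a placeholder file name.
import json

with open("raport.json") as f:
    report = json.load(f)

def walk(node, prefix=""):
    """Recursively print dotted key paths for every leaf value."""
    if isinstance(node, dict):
        for key, value in node.items():
            walk(value, f"{prefix}{key}.")
    else:
        print(prefix.rstrip(".") or "<top-level value>")

walk(report)
```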
### Inference process
Validation is run every epoch and can also be run separately on a checkpointed model.
`python ./main.py --arch se-resnext101-32x4d --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
Metrics gathered during validation:
- `val.loss` - validation loss
- `val.top1` - validation top1 accuracy
- `val.top5` - validation top5 accuracy
- `val.total_ips` - inference speed measured in images/second
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
- `val.data_time` - time spent waiting for data
- `val.compute_time` - time spent on inference
To run inference on a JPEG image, you first have to extract the model weights from the checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch se-resnext101-32x4d -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
Example output:
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* AMP
`python ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
#### Inference performance benchmark
To benchmark inference, run:
* FP32
`python ./main.py --arch se-resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch se-resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
* AMP
`python ./main.py --arch se-resnext101-32x4d -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Results
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 90 | 80.03 +/- 0.10 | 79.86 +/- 0.13 |
| 250 | 80.96 +/- 0.04 | 80.97 +/- 0.09 |
##### Example plots
The following images show a 250-epoch configuration on a DGX-1V.
![ValidationLoss](./img/loss_plot.png)
![ValidationTop1](./img/top1_plot.png)
![ValidationTop5](./img/top5_plot.png)
#### Training performance results
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 266.65 img/s | 128.23 img/s | 2.08x | 1.00x | 1.00x |
| 8 | 2031.17 img/s | 977.45 img/s | 2.08x | 7.62x | 7.62x |
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 255.22 img/s | 125.13 img/s | 2.04x | 1.00x | 1.00x |
| 8 | 1959.35 img/s | 963.21 img/s | 2.03x | 7.68x | 7.70x |
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 261.58 img/s | 130.85 img/s | 2.00x | 1.00x | 1.00x |
| 16 | 3776.03 img/s | 1953.13 img/s | 1.93x | 14.44x | 14.93x |
#### Training Time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 134 h | ~ 277 h |
| 8 | ~ 19 h | ~ 38 h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 137 h | ~ 271 h |
| 16 | ~ 11 h | ~ 20 h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 33.58 img/s | 29.72ms | 30.92ms | 31.77ms | 34.65ms |
| 2 | 66.47 img/s | 29.94ms | 31.30ms | 32.74ms | 34.79ms |
| 4 | 135.31 img/s | 29.36ms | 29.78ms | 32.61ms | 33.90ms |
| 8 | 261.52 img/s | 30.42ms | 32.73ms | 33.99ms | 35.61ms |
| 16 | 356.05 img/s | 44.61ms | 44.93ms | 45.17ms | 46.90ms |
| 32 | 391.83 img/s | 80.91ms | 81.28ms | 81.64ms | 82.69ms |
| 64 | 443.91 img/s | 142.70ms | 142.99ms | 143.46ms | 145.01ms |
| 128 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 35.08 img/s | 28.40ms | 29.75ms | 31.77ms | 35.85ms |
| 2 | 68.85 img/s | 28.92ms | 30.24ms | 31.46ms | 37.07ms |
| 4 | 131.78 img/s | 30.17ms | 31.39ms | 32.66ms | 37.17ms |
| 8 | 260.21 img/s | 30.52ms | 31.20ms | 32.92ms | 34.46ms |
| 16 | 506.62 img/s | 31.36ms | 32.48ms | 34.13ms | 36.49ms |
| 32 | 778.92 img/s | 40.69ms | 40.90ms | 41.07ms | 43.67ms |
| 64 | 880.49 img/s | 72.10ms | 72.29ms | 72.34ms | 76.46ms |
| 128 | 977.86 img/s | 130.19ms | 130.34ms | 130.41ms | 131.12ms |
##### Inference performance: NVIDIA T4
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 40.47 img/s | 24.72ms | 26.94ms | 29.33ms | 33.03ms |
| 2 | 84.16 img/s | 23.66ms | 24.53ms | 25.96ms | 29.42ms |
| 4 | 165.10 img/s | 24.08ms | 24.59ms | 25.75ms | 27.57ms |
| 8 | 266.04 img/s | 29.90ms | 30.51ms | 30.84ms | 33.07ms |
| 16 | 325.89 img/s | 48.57ms | 48.91ms | 49.02ms | 51.01ms |
| 32 | 365.99 img/s | 86.94ms | 87.15ms | 87.41ms | 90.74ms |
| 64 | 410.43 img/s | 155.30ms | 156.07ms | 156.36ms | 164.74ms |
| 128 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 38.80 img/s | 25.74ms | 26.10ms | 29.28ms | 31.72ms |
| 2 | 78.79 img/s | 25.29ms | 25.83ms | 27.18ms | 33.07ms |
| 4 | 160.22 img/s | 24.81ms | 25.58ms | 26.25ms | 27.93ms |
| 8 | 298.01 img/s | 26.69ms | 27.59ms | 29.13ms | 32.69ms |
| 16 | 567.48 img/s | 28.05ms | 28.36ms | 31.28ms | 34.44ms |
| 32 | 709.56 img/s | 44.58ms | 44.69ms | 44.98ms | 47.99ms |
| 64 | 799.72 img/s | 79.32ms | 79.40ms | 79.49ms | 84.34ms |
| 128 | 856.19 img/s | 147.92ms | 149.02ms | 149.13ms | 151.90ms |
## Release notes
### Changelog
1. October 2019
* Initial release
### Known issues
There are no known issues with this model.

Binary file not shown (added; 76 KiB).

Binary file not shown (added; 31 KiB).

Binary file not shown (added; 32 KiB).

Binary file not shown (added; 29 KiB).

View file

@@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

View file

@@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 90 --warmup 8 --wd 6.103515625e-05

View file

@@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

View file

@@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j5 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 90 --warmup 8 --wd 6.103515625e-05

View file

@@ -1,8 +0,0 @@
FROM nvcr.io/nvidia/pytorch:19.05-py3
RUN git clone https://github.com/NVIDIA/apex \
&& cd apex \
&& pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
ADD . /workspace/rn50
WORKDIR /workspace/rn50

View file

@@ -1,311 +0,0 @@
# ResNet50 v1.5
## The model
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is that, in the bottleneck blocks which require
downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but it comes with a small performance drawback (~5% imgs/sec).
The model is initialized as described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)
## Training procedure
### Optimizer
This model trains for 90 epochs, with standard ResNet v1.5 setup:
* SGD with momentum (0.875)
* Learning rate = 0.256 for batch size 256; for other batch sizes we linearly
scale the learning rate (see the sketch after this list).
* Learning rate schedule - we use cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first few epochs
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
Warmup length depends on total training length.
* Weight decay: 3.0517578125e-05 (1/32768).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label Smoothing: 0.1
* We train for:
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
* 90 Epochs -> 90 epochs is a standard for ResNet50
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
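The learning-rate rule described above (linear scaling with the global batch size, linear warmup, then cosine decay) can be summarized in a few lines. The sketch below is an illustration of that rule rather than the exact implementation in `main.py`, and the warmup length used is a placeholder:
```python
# Illustrative learning-rate rule matching the description above: base LR of
# 0.256 at batch size 256, scaled linearly with the global batch size, linear
# warmup for the first epochs, then cosine decay. Not the exact code in
# main.py; the warmup length is a placeholder.
import math

def learning_rate(epoch, total_epochs, global_batch_size,
                  base_lr=0.256, base_batch=256, warmup_epochs=5):
    peak_lr = base_lr * global_batch_size / base_batch    # linear scaling rule
    if epoch < warmup_epochs:
        return peak_lr * (epoch + 1) / warmup_epochs      # linear warmup
    progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))  # cosine decay

# Example: 8 GPUs x 256 images per GPU = 2048 global batch size, 90 epochs.
print([round(learning_rate(e, 90, 2048), 4) for e in (0, 4, 5, 45, 89)])
```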
### Data Augmentation
This model uses the following data augmentation (a torchvision sketch follows the list):
* For training:
* Normalization
* Random resized crop to 224x224
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
* For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
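This pipeline maps directly onto standard torchvision transforms. The sketch below uses the usual ImageNet normalization constants, which are an assumption here; the repository's data backends implement their own equivalent of this pipeline:
```python
# The augmentation described above expressed with torchvision transforms.
# The normalization constants are the standard ImageNet values and are an
# assumption here.
import torchvision.transforms as T

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

train_transform = T.Compose([
    T.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3)),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

val_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])
```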
### Other training recipes
This script does not target any specific benchmark.
There are changes that others have made which can speed up convergence and/or increase accuracy.
One of the more popular training recipes is provided by [fast.ai](https://github.com/fastai/imagenet-fast).
The fast.ai recipe introduces many changes to the training procedure, one of which is progressive resizing of the training images.
The first part of training uses 128px images, the middle part uses 224px images, and the last part uses 288px images.
The final validation is performed on 288px images.
Training script in this repository performs validation on 224px images, just like the original paper described.
These two approaches can't be directly compared, since the fast.ai recipe requires validation on 288px images,
and this recipe keeps the original assumption that validation is done on 224px images.
Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
# Setup
## Requirements
Ensure you meet the following requirements:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.05-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* (optional) NVIDIA Volta GPU (see section below) - for best training performance using mixed precision
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Training using mixed precision with Tensor Cores
### Hardware requirements
Training with mixed precision on NVIDIA Tensor Cores requires an
[NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)-based GPU.
### Software changes
For information about how to train using mixed precision, see the
[Mixed Precision Training paper](https://arxiv.org/abs/1710.03740)
and
[Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
For PyTorch, mixed-precision support is easily added with NVIDIA's
[APEX](https://github.com/NVIDIA/apex), a PyTorch extension that contains
utility libraries, such as AMP, which require minimal network code changes to
leverage Tensor Core performance.
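As an illustration of what those minimal changes look like, the sketch below wraps a placeholder model and optimizer with the APEX AMP API. It is a schematic, not this repository's training loop, and the chosen opt level is only one reasonable option:
```python
# Schematic of enabling mixed precision with APEX AMP; not this repository's
# training loop. The model, optimizer, and data below are placeholders.
import torch
import torch.nn.functional as F
from apex import amp

model = torch.nn.Linear(224 * 224 * 3, 1000).cuda()   # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.256, momentum=0.875)

# The key change: wrap the model and optimizer. "O1" casts selected ops to
# FP16 while keeping the model weights in FP32; other opt levels exist.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

def train_step(images, targets):
    outputs = model(images)
    loss = F.cross_entropy(outputs, targets)
    optimizer.zero_grad()
    # Scale the loss so FP16 gradients do not underflow; AMP unscales the
    # gradients before the optimizer step.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    return loss.item()
```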
### DALI
For DGX2 configurations we use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when the CPU becomes a bottleneck.
Run training with `--data-backend dali-gpu` to enable DALI.
# Quick start guide
## Getting the data
The ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs; therefore, no separate preprocessing/augmentation step is needed.
1. Download the images from http://image-net.org/download-images
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
## Running training
To run training for a standard configuration (DGX1V/DGX2V, FP16/FP32, 50/90/250 Epochs),
run one of the scripts in the `./resnet50v1.5/training` directory
called `./resnet50v1.5/training/{DGX1, DGX2}_RN50_{FP16, FP32}_{50,90,250}E.sh`.
Ensure ImageNet is mounted in the `/data/imagenet` directory.
To run a non-standard configuration, use:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --fp16 --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --fp16 --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
## Running inference
To run inference on a checkpointed model, run:
`python ./main.py --arch resnet50 --evaluate --resume <path to checkpoint> -b <batch size> <path to imagenet>`
## Benchmarking
### Training performance
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Inference performance
To benchmark inference, run:
* FP32
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
## Training Accuracy Results
### NVIDIA DGX1V (8x V100 16G)
#### Accuracy
| **# of epochs** | **mixed precision top1** | **FP32 top1** |
|:-----------------:|:------------------------:|:---------------:|
| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
| 90 | 77.23 +/- 0.04 | 77.08 +/- 0.07 |
| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
#### Training time for 90 epochs
| **number of GPUs** | **mixed precision training time** | **FP32 training time** |
|:------------------:|:---------------------------------:|:----------------------:|
| 1 | ~46h | ~90h |
| 4 | ~14h | ~26h |
| 8 | ~8h | ~14h |
### NVIDIA DGX2V (16x V100 32G)
#### Accuracy
| **# of epochs** | **mixed precision top1** | **FP32 top1** |
|:-----------------:|:------------------------:|:---------------:|
| 50 | 75.80 +/- 0.08 | 76.04 +/- 0.05 |
| 90 | 77.10 +/- 0.06 | 77.23 +/- 0.04 |
| 250 | 78.59 +/- 0.13 | 78.46 +/- 0.03 |
#### Training time for 90 epochs
| **number of GPUs** | **mixed precision training time** | **FP32 training time** |
|:------------------:|:---------------------------------:|:----------------------:|
| 2 | ~24h | ~45h |
| 8 | ~8h | ~13h |
| 16 | ~4h | ~7h |
### Example plots (250 Epochs configuration on DGX2)
![TrainingLoss](./img/DGX2_250_loss.png)
![ValidationTop1](./img/DGX2_250_top1.png)
![ValidationTop5](./img/DGX2_250_top5.png)
## Training Performance Results
### NVIDIA DGX1V (8x V100 16G)
| **number of GPUs** | **mixed precision img/s** | **FP32 img/s** | **mixed precision speedup** | **mixed precision weak scaling** | **FP32 weak scaling** |
|:------------------:|:-------------------------:|:--------------:|:---------------------------:|:--------------------------------:|:---------------------:|
| 1 | 747.3 | 363.1 | 2.06 | 1.00 | 1.00 |
| 4 | 2886.9 | 1375.5 | 2.1 | 3.86 | 3.79 |
| 8 | 5815.8 | 2857.9 | 2.03 | 7.78 | 7.87 |
### NVIDIA DGX2V (16x V100 32G)
| **number of GPUs** | **mixed precision img/s** | **FP32 img/s** | **mixed precision speedup** |
|:------------------:|:-------------------------:|:--------------:|:---------------------------:|
| 16 | 12086.1 | 5578.2 | 2.16 |
## Inference Performance Results
### NVIDIA VOLTA V100 16G on DGX1V
| **batch size** | **mixed precision img/s** | **FP32 img/s** |
|:--------------:|:-------------------------:|:--------------:|
| 1 | 131.8 | 134.9 |
| 2 | 248.7 | 260.6 |
| 4 | 486.4 | 425.5 |
| 8 | 908.5 | 783.6 |
| 16 | 1370.6 | 998.9 |
| 32 | 2287.5 | 1092.3 |
| 64 | 2476.2 | 1166.6 |
| 128 | 2615.6 | 1215.6 |
| 256 | 2696.7 | N/A |
# Changelog
1. September 2018
* Initial release
2. January 2019
* Added options Label Smoothing, fan-in initialization, skipping weight decay on batch norm gamma and bias.
3. May 2019
* Cosine LR schedule
* MixUp regularization
* DALI support
* DGX2 configurations
* gradients accumulation
# Known issues
There are no known issues with this model.

View file

@@ -1,4 +0,0 @@
# This script launches ResNet50 training in FP16 on 1 GPUs using 256 batch size (256 per GPU)
# Usage ./RN50_FP16_1GPU.sh <path to this repository> <additional flags>
python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.1 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet

View file

@@ -1,4 +0,0 @@
# This script launches ResNet50 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU)
# Usage ./RN50_FP16_4GPU.sh <path to this repository> <additional flags>
python $1/multiproc.py --nproc_per_node 4 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.4 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet

View file

@@ -1,4 +0,0 @@
# This script launches ResNet50 training in FP16 on 8 GPUs using 2048 batch size (256 per GPU)
# Usage ./RN50_FP16_8GPU.sh <path to this repository> <additional flags>
python $1/multiproc.py --nproc_per_node 8 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.8 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet

View file

@@ -1,4 +0,0 @@
# This script evaluates ResNet50 model in FP16 using 64 batch size on 1 GPU
# Usage: ./RN50_FP16_EVAL.sh <path to this repository> <path to checkpoint>
python $1/main.py -j5 -p 100 --arch resnet50 -b 256 --resume $2 --evaluate --fp16 /data/imagenet

View file

@@ -1,3 +0,0 @@
# This script launches ResNet50 inference benchmark in FP16 on 1 GPU with 256 batch size
python ./main.py -j5 --arch resnet50 -b 256 --fp16 --benchmark-inference /data/imagenet

View file

@@ -1,4 +0,0 @@
# This script launches ResNet50 training in FP32 on 1 GPUs using 128 batch size (128 per GPU)
# Usage ./RN50_FP32_1GPU.sh <path to this repository> <additional flags>
python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.05 --epochs 90 $2 /data/imagenet

Some files were not shown because too many files have changed in this diff.