[QuartzNet/PyT] Release QuartzNet model

This commit is contained in:
Mikolaj Blaz 2021-09-14 06:03:36 -07:00 committed by Krzysztof Kudrynski
parent 88eb3cff2f
commit 649776f79a
55 changed files with 6160 additions and 0 deletions


@@ -0,0 +1,9 @@
__pycache__
*.pt
results/
datasets/
checkpoints/
*.swp
*.swo
*.swn


@@ -0,0 +1,30 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
FROM ${FROM_IMAGE_NAME}
RUN apt update && apt install -y libsndfile1 && apt install -y sox && rm -rf /var/lib/apt/lists/*
WORKDIR /workspace/quartznet
# Install requirements (do this first for better caching)
COPY requirements.txt .
RUN conda install -y pyyaml==5.4.1
RUN pip install --disable-pip-version-check -U -r requirements.txt
RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda110==1.2.0
# Copy rest of files
COPY . .


@@ -0,0 +1,203 @@
Except where otherwise noted, the following license applies to all files in this repo.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,5 @@
QuartzNet in PyTorch
This repository includes source code (in "common/") from:
* https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license.


@@ -0,0 +1,674 @@
# QuartzNet For PyTorch
This repository provides a script and recipe to train the QuartzNet model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Enabling TF32](#enabling-tf32)
* [Glossary](#glossary)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
* [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb)
* [Inference performance: NVIDIA DGX-2 (1x V100 32GB)](#inference-performance-nvidia-dgx-2-1x-v100-32gb)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
This repository provides an implementation of the QuartzNet model in PyTorch from the paper [QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/pdf/1910.10261).
The QuartzNet model is an end-to-end neural acoustic model for automatic speech recognition (ASR) that provides high accuracy at a low memory footprint. The QuartzNet architecture of convolutional layers was designed to facilitate fast GPU inference by allowing whole sub-blocks to be fused into a single GPU kernel. This is important for meeting the strict real-time requirements of ASR systems in deployment.
This repository is a PyTorch implementation of QuartzNet and provides scripts to train the QuartzNet 10x5 model from scratch on the [LibriSpeech](http://www.openslr.org/12) dataset to achieve greedy decoding results that improve upon the original paper.
The repository is self-contained and includes data preparation scripts, training, and inference scripts.
Both training and inference scripts offer the option to use Automatic Mixed Precision (AMP) to benefit from Tensor Cores for better performance.
In addition to providing the hyperparameters for training a model checkpoint, we publish a thorough inference analysis across different NVIDIA GPU platforms, for example, DGX-2, NVIDIA A100 GPU, and T4.
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.4x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
QuartzNet is an end-to-end neural acoustic model that is based on efficient, time-channel separable convolutions (Figure 1).
In the audio processing stage, each frame is transformed into mel-scale spectrogram features, which the acoustic model takes as input and outputs a probability distribution over the vocabulary for each frame.
<p align="center">
<img src="./img/model.png" alt="QuartzNet model architecture" width="50%" />
</p>
<p align="center">
<em>Figure 1. Architecture of QuartzNet (<a href="https://arxiv.org/abs/1910.10261">source</a>)
</em>
</p>
### Default configuration
The following features were implemented in this model:
* GPU-supported feature extraction with data augmentation options [SpecAugment](https://arxiv.org/abs/1904.08779) and [Cutout](https://arxiv.org/pdf/1708.04552.pdf) using the DALI library
* offline and online [Speed Perturbation](https://www.danielpovey.com/files/2015_interspeech_augmentation.pdf) using the DALI library
* data-parallel multi-GPU training and evaluation
* AMP with dynamic loss scaling for Tensor Core training
* FP16 inference
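As an illustration of the SpecAugment-style masking listed above, the following simplified sketch applies random frequency and time masks to a mel-spectrogram tensor; the mask counts and widths are illustrative values, and the actual augmentations in this repository are applied through DALI, not this code:
```python
import torch

def spec_augment(spec, freq_masks=2, max_freq=15, time_masks=2, max_time=55):
    """Zero out random frequency bands and time ranges of a (freq, time) spectrogram."""
    spec = spec.clone()
    n_freq, n_time = spec.shape
    for _ in range(freq_masks):
        width = torch.randint(0, max_freq + 1, (1,)).item()
        start = torch.randint(0, max(1, n_freq - width), (1,)).item()
        spec[start:start + width, :] = 0.0       # frequency mask
    for _ in range(time_masks):
        width = torch.randint(0, max_time + 1, (1,)).item()
        start = torch.randint(0, max(1, n_time - width), (1,)).item()
        spec[:, start:start + width] = 0.0       # time mask
    return spec

masked = spec_augment(torch.randn(64, 400))      # 64 mel bins x 400 frames
```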
### Feature support matrix
| **Feature** | **QuartzNet** |
|---------------|---------------|
|[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes |
#### Features
**DALI**
NVIDIA Data Loading Library (DALI) is a collection of highly optimized building blocks, and an execution engine, to accelerate the pre-processing of the input data for deep learning applications. DALI provides both the performance and the flexibility for accelerating different data pipelines as a single library. This single library can then be easily integrated into different deep learning training and inference applications. For details, see example sources in this repository or see the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
**Automatic Mixed Precision (AMP)**
Computation graphs can be modified by PyTorch at runtime to support mixed precision training. A detailed explanation of mixed precision can be found in the next section.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog post.
#### Enabling mixed precision
For training, mixed precision can be enabled by setting the flag: `train.py --amp`. When using bash helper scripts, mixed precision can be enabled with the environment variable `AMP=true`, for example, `AMP=true bash scripts/train.sh`, `AMP=true bash scripts/inference.sh`, etc.
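Under the hood, mixed precision combines automatic casting to FP16 with dynamic loss scaling. The following generic PyTorch sketch illustrates the mechanism; it is not the repository's implementation, and the model and training loop are placeholders:
```python
import torch

model = torch.nn.Linear(1024, 1024).cuda()       # placeholder for the acoustic model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()             # dynamic loss scaling

for _ in range(10):
    x = torch.randn(16, 1024, device='cuda')
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():              # run eligible ops in FP16
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()                # scale the loss to preserve small gradients
    scaler.step(optimizer)
    scaler.update()
```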
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
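If needed, TF32 can be inspected or disabled from PyTorch through the standard `torch.backends` switches, as in the sketch below; the printed values reflect typical defaults in recent NGC containers and are not guaranteed for every build:
```python
import torch

# TF32 is used automatically for matmul and cuDNN convolutions on Ampere GPUs.
print(torch.backends.cuda.matmul.allow_tf32)     # typically True in the 21.07 container
print(torch.backends.cudnn.allow_tf32)           # typically True

# Disable TF32 to force full FP32 math, e.g. for numerical comparisons.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```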
### Glossary
**Time-channel separable (TCS) convolution**
A module composed mainly of two convolutional layers: a 1D depthwise convolutional layer and a pointwise convolutional layer (Figure 2). The former operates across K time frames, the latter across all channels. By decoupling the time and channel axes, the separable module uses fewer parameters and computes the result faster than it otherwise would (a minimal PyTorch sketch follows Figure 2).
<p align="center">
<img src="./img/tcs_conv.png" alt="Time-channel separable (TCS) convolutional module" width="50%" />
</p>
<p align="center">
<em>Figure 2. Time-channel separable (TCS) convolutional module: (a) basic design, (b) TCS with a group shuffle layer, added to increase cross-group interchange</em>
</p>
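A minimal PyTorch sketch of the basic TCS design from Figure 2(a), a depthwise 1D convolution over time followed by a pointwise (1x1) convolution over channels, is shown below; the layer sizes are illustrative and do not correspond to the QuartzNet configuration:
```python
import torch
import torch.nn as nn

class TCSConv1d(nn.Module):
    """Time-channel separable convolution: depthwise over time, then pointwise over channels."""
    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        self.depthwise = nn.Conv1d(in_channels, in_channels, kernel_size,
                                   padding=kernel_size // 2, groups=in_channels)
        self.pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):                      # x: (batch, channels, time)
        return self.pointwise(self.depthwise(x))

tcs = TCSConv1d(256, 512, kernel_size=33)      # illustrative sizes, K = 33 time frames
out = tcs(torch.randn(4, 256, 100))            # -> (4, 512, 100)
```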
**Automatic Speech Recognition (ASR)**
Uses both an acoustic model and a language model to output the transcript of an input audio signal.
**Acoustic model**
Assigns a probability distribution over a vocabulary of characters given an audio frame. Typically, a large part of the entire ASR model.
**Language model**
Assigns a probability to a sequence of words; given a sequence of words, it estimates how likely the whole sequence is.
**Pre-training**
Training a model on vast amounts of data on the same (or a different) task to build a general understanding.
## Setup
The following section lists the requirements that you need to meet in order to start training the QuartzNet model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch 21.07-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 21.07-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
- Supported GPUs:
- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
Further required Python packages are listed in `requirements.txt` and are installed automatically when the Docker container is built. To install them manually, run:
```bash
pip install -r requirements.txt
```
If you cannot use the PyTorch 21.07-py3 NGC container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
## Quick Start Guide
To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the QuartzNet model on the LibriSpeech dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet
```
2. Build the QuartzNet PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
3. Start an interactive session in the NGC container to prepare the dataset, or run training/inference.
Specify a local mountpoint for the dataset with the `DATA_DIR` variable:
```bash
DATA_DIR=<path_on_the_host> bash scripts/docker/launch.sh
```
4. Download and preprocess the dataset.
No GPU is required for data download and preprocessing.
It can take several hours to complete, and requires over 250GB of free disk space.
This repository provides scripts to download and extract LibriSpeech [http://www.openslr.org/12](http://www.openslr.org/12). The dataset contains 1000 hours of 16kHz read English speech derived from public domain audiobooks from the LibriVox project and has been carefully segmented and aligned. For more information, see the [LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS](http://www.danielpovey.com/files/2015_icassp_librispeech.pdf) paper.
Inside the container, download and extract the datasets into the required format for later training and inference:
```bash
bash scripts/download_librispeech.sh
```
After the data download is complete, the following folders should exist:
```bash
datasets/LibriSpeech/
├── dev-clean
├── dev-other
├── test-clean
├── test-other
├── train-clean-100
├── train-clean-360
└── train-other-500
```
Since the container's `/datasets` directory is mounted to `DATA_DIR` on the host, the downloaded dataset is also accessible outside of the container at `$DATA_DIR/LibriSpeech`.
Next, convert the data into WAV files:
```bash
bash scripts/preprocess_librispeech.sh
```
After the data is converted, the following additional files and folders should exist:
```bash
datasets/LibriSpeech/
├── dev-clean-wav
├── dev-other-wav
├── librispeech-train-clean-100-wav.json
├── librispeech-train-clean-360-wav.json
├── librispeech-train-other-500-wav.json
├── librispeech-dev-clean-wav.json
├── librispeech-dev-other-wav.json
├── librispeech-test-clean-wav.json
├── librispeech-test-other-wav.json
├── test-clean-wav
├── test-other-wav
├── train-clean-100-wav
├── train-clean-360-wav
└── train-other-500-wav
```
5. Start training.
Inside the container, use the following script to start training.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/train.sh
```
By default, automatic mixed precision is disabled, the batch size is 144 over two gradient accumulation steps, and the recipe runs on a total of 8 GPUs. The hyperparameters are tuned for a GPU with at least 32GB of memory and will require adjustment for different configurations (for example, by lowering the batch size and using more gradient accumulation steps).
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Training process](#training-process) sections.
6. Start validation/evaluation.
Inside the container, use the following script to run evaluation.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/evaluation.sh
```
By default, this will use full precision, a batch size of 64, and run on a single GPU.
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Inference process](#inference-process) sections.
7. Start inference/predictions.
Inside the container, use the following script to run inference.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models).
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/inference.sh
```
By default, this will use single precision, a batch size of 64, and run on a single GPU.
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Inference process](#inference-process) sections.
Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance benchmark](#training-performance-results) or [Inference performance benchmark](#inference-performance-results) results. Following the steps in these sections ensures that you achieve the same accuracy and performance as stated in the [Results](#results) section.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the `root` directory, the most important files are:
```
quartznet
├── common # data pre-processing, logging, etc.
├── configs # model configurations
├── Dockerfile # container with the basic set of dependencies to run QuartzNet
├── inference.py # entry point for inference
├── quartznet # model-specific code
├── scripts # one-click scripts required for running various supported functionalities
│ ├── docker # contains the scripts for building and launching the container
│ ├── download_librispeech.sh # downloads LibriSpeech dataset
│ ├── evaluation.sh # runs evaluation using the `inference.py` script
│ ├── inference_benchmark.sh # runs the inference benchmark using the `inference_benchmark.py` script
│ ├── inference.sh # runs inference using the `inference.py` script
│ ├── preprocess_librispeech.sh # preprocess LibriSpeech raw data files for training and inference
│ ├── train_benchmark.sh # runs the training performance benchmark using the `train.py` script
│ └── train.sh # runs training using the `train.py` script
├── train.py # entry point for training
└── utils # data downloading and common routines
```
### Parameters
Parameters should be set as environment variables.
The complete list of available parameters for the `scripts/train.sh` script contains:
```bash
DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
MODEL_CONFIG: relative path to model configuration. (default: 'configs/quartznet10x5dr_speedp_online_speca.yaml')
OUTPUT_DIR: directory for results, logs, and created checkpoints. (default: '/results')
CHECKPOINT: a specific model checkpoint to continue training from. To resume training from the last checkpoint, see the RESUME option.
RESUME: resume training from the last checkpoint found in OUTPUT_DIR, or from scratch if there are no checkpoints (default: true)
CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: true)
NUM_GPUS: number of GPUs to use. (default: 8)
AMP: if set to `true`, enables automatic mixed precision (default: false)
GPU_BATCH_SIZE: batch size for every forward/backward pass. The effective batch size might be higher, if gradient accumulation is enabled (default: 72)
GRAD_ACCUMULATION: number of forward/backward passes until the optimizer updates weights. (default: 2)
LEARNING_RATE: initial learning rate. (default: 0.01)
MIN_LEARNING_RATE: minimum learning rate, despite LR scheduling (default: 1e-5)
LR_POLICY: how to decay LR (default: exponential)
LR_EXP_GAMMA: decay factor for the exponential LR schedule (default: 0.981)
EMA: decay factor for exponential averages of checkpoints (default: 0.999)
SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
EPOCHS: number of training epochs. (default: 440)
WARMUP_EPOCHS: number of initial epochs with linearly increasing LR. (default: 2)
HOLD_EPOCHS: number of epochs to hold maximum LR after warmup. (default: 140)
SAVE_FREQUENCY: number of epochs between saving the model to disk. (default: 10)
EPOCHS_THIS_JOB: run training for this number of epochs. Does not affect LR schedule like the EPOCHS parameter. (default: 0)
DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
EVAL_FREQUENCY: number of steps between evaluations on the validation set. (default: 544)
PREDICTION_FREQUENCY: the number of steps between writing a sample prediction to stdout. (default: 544)
TRAIN_MANIFESTS: lists of .json training set files
VAL_MANIFESTS: lists of .json validation set files
```
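The effective batch size per weight update is the per-pass batch size (`GPU_BATCH_SIZE`) times the number of gradient accumulation steps (`GRAD_ACCUMULATION`) and, for data-parallel training, times the number of GPUs. The following generic sketch illustrates gradient accumulation itself; the values and the model are placeholders, not the training script's internals:
```python
import torch

model = torch.nn.Linear(64, 64)                     # placeholder for the acoustic model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
gpu_batch_size, grad_accumulation = 8, 2            # illustrative values

optimizer.zero_grad()
for step in range(1, 101):
    x = torch.randn(gpu_batch_size, 64)
    loss = model(x).pow(2).mean() / grad_accumulation   # average the loss over accumulation steps
    loss.backward()                                      # gradients accumulate in .grad buffers
    if step % grad_accumulation == 0:
        optimizer.step()                                 # one weight update per accumulated batch
        optimizer.zero_grad()
```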
The complete list of available parameters for the `scripts/inference.sh` script contains:
```bash
DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
MODEL_CONFIG: model configuration. (default: 'configs/quartznet10x5dr_speedp-online_speca.yaml')
OUTPUT_DIR: directory for results and logs. (default: '/results')
CHECKPOINT: model checkpoint path. (required)
DATASET: name of the LibriSpeech subset to use. (default: 'dev-clean')
LOG_FILE: path to the DLLogger .json logfile. (default: '')
CUDNN_BENCHMARK: enable cudnn benchmark mode for using more optimized kernels. (default: false)
MAX_DURATION: filter out recordings longer than MAX_DURATION seconds. (default: "")
PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
NUM_GPUS: number of GPUs to use. Note that with > 1 GPUs WER results might be inaccurate due to the batching policy. (default: 1)
NUM_STEPS: number of batches to evaluate, loop the dataset if necessary. (default: 0)
NUM_WARMUP_STEPS: number of initial steps before measuring performance. (default: 0)
AMP: enable FP16 inference with AMP. (default: false)
BATCH_SIZE: data batch size. (default: 64)
EMA: Attempt to load exponentially averaged weights from a checkpoint. (default: true)
SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
CPU: run inference on CPU. (default: false)
LOGITS_FILE: dump logit matrices to a file. (default: "")
PREDICTION_FILE: save predictions to a file. (default: "${OUTPUT_DIR}/${DATASET}.predictions")
```
The complete list of available parameters for `scripts/evaluation.sh` is the same as for `scripts/inference.sh`; only the following defaults differ:
```bash
PREDICTION_FILE: (default: "")
DATASET: (default: "test-other")
```
The `scripts/inference_benchmark.sh` script pads all input to a fixed duration and computes the mean and the 90th, 95th, and 99th percentile of latency for the specified number of inference steps. Latency is measured in milliseconds per batch. The script measures latency for a single GPU and loops over a number of batch sizes and durations. It extends `scripts/inference.sh` and changes the following defaults:
```bash
BATCH_SIZE_SEQ: batch sizes to measure with. (default: "1 2 4 8 16")
MAX_DURATION_SEQ: input durations (in seconds) to measure with (default: "2 7 16.7")
CUDNN_BENCHMARK: (default: true)
PAD_TO_MAX_DURATION: (default: true)
NUM_WARMUP_STEPS: (default: 10)
NUM_STEPS: (default: 500)
DALI_DEVICE: (default: "cpu")
```
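The reported statistics can be reproduced from raw per-batch timings with a few lines of NumPy, as in this sketch, where synthetic latencies stand in for measured values:
```python
import numpy as np

# Per-batch latencies in milliseconds, collected over NUM_STEPS iterations after
# discarding NUM_WARMUP_STEPS warm-up iterations (synthetic values used here).
latencies_ms = np.random.lognormal(mean=3.6, sigma=0.1, size=500)

print("avg: %.2f ms" % latencies_ms.mean())
for p in (90, 95, 99):
    print("%d%%: %.2f ms" % (p, np.percentile(latencies_ms, p)))
```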
The `scripts/train_benchmark.sh` script pads all input to the same length according to the input argument `MAX_DURATION` and measures average training latency and throughput performance. Latency is measured in seconds per batch, throughput in sequences per second.
Training performance is measured with online speed perturbation and NVIDIA cuDNN benchmark mode enabled.
The script `scripts/train_benchmark.sh` loops over a number of batch sizes and GPU counts.
It extends `scripts/train.sh`; the complete list of available parameters for the `scripts/train_benchmark.sh` script contains:
```bash
ACC_BATCH_SIZE: accumulated (effective) batch size to measure with. (default: "144")
GRAD_ACC_SEQ: the sequence of gradient accumulation settings to measure with. (default: "4 2")
NUM_GPUS_SEQ: number of GPUs to run the training on. (default: "1 4 8")
MODEL_CONFIG: (default: "configs/quartznet10x5dr_speedp-online_train-benchmark.yaml")
TRAIN_MANIFESTS: (default: "$DATA_DIR/librispeech-train-clean-100-wav.json")
RESUME: (default: false)
EPOCHS_THIS_JOB: (default: 2)
EPOCHS: (default: 100000)
SAVE_FREQUENCY: (default: 100000)
EVAL_FREQUENCY: (default: 100000)
GRAD_ACCUMULATION_STEPS: (default: 1)
PAD_TO_MAX_DURATION: (default: true)
EMA: (default: 0)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
```bash
python train.py --help
python inference.py --help
```
### Getting the data
QuartzNet is trained on the LibriSpeech dataset. We use the concatenation of `train-clean-100`, `train-clean-360`, and `train-other-500` for training and `dev-clean` for validation.
This repository contains the `scripts/download_librispeech.sh` and `scripts/preprocess_librispeech.sh` scripts, which automatically download and preprocess the training, test, and development datasets. By default, data is downloaded to the `/datasets/LibriSpeech` directory. A minimum of 250GB of free space is required for download and preprocessing; the final preprocessed dataset is approximately 100GB.
#### Dataset guidelines
The `scripts/preprocess_librispeech.sh` script converts the input audio files to WAV format with a sample rate of 16kHz. The target transcripts are stripped of whitespace characters and lower-cased. No offline augmentations are stored on disk; these are computed online with the DALI library without any impact on training time.
After preprocessing, the script creates JSON metadata files with output file paths, sample rate, target transcripts, and other metadata. These JSON files are used by the training script to identify training and validation datasets.
The QuartzNet model was tuned on audio signals with a sample rate of 16kHz. If you wish to use a different sampling rate, some hyperparameters might need to be changed; specifically, the window size and step size.
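For reference, mel-scale filterbank features of the kind computed by the DALI pipeline can be approximated offline with librosa; the window and step sizes below are illustrative placeholders, as is the input file name:
```python
import numpy as np
import librosa

sample_rate = 16000
window_size = 0.02      # seconds; illustrative, must match the model configuration
window_step = 0.01      # seconds; illustrative

# 'sample.wav' is a hypothetical input file; librosa resamples it to 16 kHz.
audio, _ = librosa.load('sample.wav', sr=sample_rate)
mel = librosa.feature.melspectrogram(
    y=audio, sr=sample_rate, n_mels=64,
    n_fft=int(window_size * sample_rate),
    hop_length=int(window_step * sample_rate))
log_mel = np.log(mel + 1e-20)   # shape: (n_mels, num_frames)
```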
#### Multi-dataset
Training scripts in this repository treat the training subsets of LibriSpeech (`train-clean-100`, `train-clean-360`, `train-other-500`) as three independent training datasets.
To add more datasets, follow the LibriSpeech format, adjust the provided pre-processing scripts to generate metadata JSON files, and pass those files to the `scripts/train.sh` script through the `TRAIN_MANIFESTS` variable.
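If a single combined manifest is preferred over passing several files, metadata files can also be merged with a short script; the sketch below assumes each manifest is a JSON list of utterance entries, and the non-LibriSpeech paths are hypothetical:
```python
import json

# The non-LibriSpeech paths below are hypothetical examples.
manifest_paths = [
    '/datasets/LibriSpeech/librispeech-train-clean-100-wav.json',
    '/datasets/MyCorpus/my-corpus-train-wav.json',
]

entries = []
for path in manifest_paths:
    with open(path) as f:
        entries.extend(json.load(f))   # assumes each manifest is a JSON list of entries

with open('/datasets/combined-train.json', 'w') as f:
    json.dump(entries, f)
```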
### Training process
Training is performed using the `train.py` script along with parameters defined in `scripts/train.sh`.
The `scripts/train.sh` script runs a job on a single node that trains the QuartzNet model from scratch using LibriSpeech as training data. To make training more efficient, we discard audio samples longer than 16.7 seconds from the training dataset; these samples make up less than 1% of the total. Such filtering does not degrade accuracy, but it allows us to decrease the number of time steps in a batch, which requires less GPU memory and increases training speed.
Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the training script:
* Runs on 8 GPUs with at least 32GB of memory and training/evaluation batch size 48, split over three gradient accumulation steps
* Uses TF32 precision (A100 GPU) or FP32 (other GPUs)
* Trains on the concatenation of all 3 LibriSpeech training datasets and evaluates on the LibriSpeech dev-clean dataset
* Maintains an exponential moving average of parameters for evaluation
* Has cuDNN benchmark enabled
* Runs for 260 epochs
* Uses an initial learning rate of 0.02 and an exponential learning rate decay
* Saves a checkpoint every 10 epochs
* Automatically removes old checkpoints and preserves milestone checkpoints
* Runs evaluation on the development dataset every epoch and at the end of training
* Maintains a separate checkpoint with the lowest WER on development set
* Prints out training progress every iteration to `stdout`
* Creates a DLLogger log file and a TensorBoard log
* Calculates speed perturbation online during training
* Uses `SpecAugment` in data pre-processing
* Filters out audio samples longer than 16.7 seconds
* Pads each batch so its length is divisible by 16
* Uses time-channel separable convolutions as described in the paper
* Uses weight decay of 0.001
* Uses [Novograd](https://arxiv.org/pdf/1905.11286.pdf) as the optimizer with betas=(0.95, 0)
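The batch padding mentioned in the list above refers to the time dimension of the feature tensor; a minimal sketch of padding the time axis to a multiple of 16 (shapes are illustrative):
```python
import torch
import torch.nn.functional as F

def pad_time_to_multiple(features, multiple=16):
    """Right-pad the time axis of a (batch, n_mels, time) tensor with zeros."""
    time = features.size(-1)
    pad = (multiple - time % multiple) % multiple
    return F.pad(features, (0, pad))            # pad only the last (time) dimension

feats = torch.randn(8, 64, 437)                 # illustrative batch of filterbank features
print(pad_time_to_multiple(feats).shape)        # torch.Size([8, 64, 448])
```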
Enabling AMP permits batch size 144 with one gradient accumulation step. Since each batch has to be padded to the longest sequence, all GPUs have to wait for the slowest one, and two accumulation steps are slightly faster.
The current training setup improves upon the greedy WER [Results](#results) of the QuartzNet paper.
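The exponential moving average of parameters listed among the training defaults above (controlled by the `EMA` parameter) is maintained alongside the regular weights and is typically the set of weights used for evaluation. A generic sketch of such an average, not the training script's exact bookkeeping:
```python
import copy
import torch

def update_ema(model, ema_model, decay=0.999):
    """In-place update: ema = decay * ema + (1 - decay) * current."""
    with torch.no_grad():
        for p, ema_p in zip(model.parameters(), ema_model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)

model = torch.nn.Linear(16, 16)            # placeholder model
ema_model = copy.deepcopy(model)           # the averaged copy used for evaluation

# Call after every optimizer step:
update_ema(model, ema_model, decay=0.999)
```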
### Inference process
Inference is performed using the `inference.py` script along with parameters defined in `scripts/inference.sh`.
The `scripts/inference.sh` script runs the job on a single GPU, taking a pre-trained QuartzNet model checkpoint and running it on the specified dataset.
Apart from the default arguments as listed in the [Parameters](#parameters) section, by default, the inference script:
* Evaluates on the LibriSpeech dev-clean dataset and prints out the final word error rate
* Uses a batch size of 64
* Creates a log file with progress and results which will be stored in the `results` folder
* Pads each batch so its length is divisible by 16
* Does not use data augmentation
* Does greedy decoding and optionally saves the transcriptions in the results folder
* Has the option to save the model output tensors for more complex decoding, for example, beam search
* Has cuDNN benchmark disabled
To view all available options for inference, run `python inference.py --help`.
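Greedy decoding, as mentioned in the list above, selects the most likely symbol at every frame, collapses repeated symbols, and removes CTC blanks. A minimal sketch of this procedure; the character set and blank index are illustrative, not the repository's exact vocabulary handling:
```python
import torch

labels = list("abcdefghijklmnopqrstuvwxyz '")   # illustrative character set
blank_idx = len(labels)                          # assume the CTC blank is the last class

def greedy_ctc_decode(log_probs):
    """log_probs: (time, num_classes) tensor of per-frame log-probabilities."""
    best = log_probs.argmax(dim=-1).tolist()     # most likely class per frame
    out, prev = [], None
    for idx in best:
        if idx != prev and idx != blank_idx:     # collapse repeats, drop blanks
            out.append(labels[idx])
        prev = idx
    return ''.join(out)

print(greedy_ctc_decode(torch.randn(50, len(labels) + 1).log_softmax(dim=-1)))
```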
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance with a number of specific configurations, run:
```bash
GRAD_ACC_SEQ=<SEQUENCE> NUM_GPUS_SEQ=<NUMS_OF_GPUS> bash scripts/train_benchmark.sh
```
for example:
```bash
GRAD_ACC_SEQ="12 24" NUM_GPUS_SEQ="4 8" bash scripts/train_benchmark.sh
```
This invocation measures performance in four setups: two different gradient accumulation settings (and thus two batch sizes for each forward/backward pass) times two GPU counts.
By default, this script makes forward/backward pre-allocation passes with all possible audio lengths, which allows training step times to stabilize immediately in cuDNN benchmark mode, and trains for two epochs on the `train-clean-100` subset of LibriSpeech.
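With cuDNN benchmark mode enabled, the first forward/backward pass for every new input shape triggers an algorithm search, so warming up over all audio lengths that can occur keeps these one-off costs out of the measurement. A generic sketch of the idea, with a placeholder model and length grid:
```python
import torch

torch.backends.cudnn.benchmark = True
model = torch.nn.Conv1d(64, 64, kernel_size=33, padding=16).cuda()   # placeholder model

# Warm up cuDNN's algorithm cache for every feature length that can occur in training.
for time_steps in range(16, 1680 + 1, 16):       # lengths padded to multiples of 16
    x = torch.randn(8, 64, time_steps, device='cuda')
    model(x).sum().backward()
torch.cuda.synchronize()
```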
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size and audio length, run:
```bash
BATCH_SIZE_SEQ=<BATCH_SIZES> MAX_DURATION_SEQ=<DURATIONS> bash scripts/inference_benchmark.sh
```
for example:
```bash
BATCH_SIZE_SEQ="24 48" MAX_DURATION_SEQ="2 7 16.7" bash scripts/inference_benchmark.sh
```
The script runs on a single GPU and evaluates on a dataset of fixed-length utterances shorter than `MAX_DURATION`, padded to that duration.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
| Number of GPUs | Batch size per GPU | Precision | dev-clean WER | dev-other WER | test-clean WER | test-other WER | Time to train |
|-----|-----|-------|-------|-------|------|-------|------|
| 8 | 144 | mixed | 3.47 | 10.84 | 3.69 | 10.69 | 34 h |
The table reports word error rate (WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
##### Training stability test
The following table compares greedy decoding word error rates across 8 different training runs with different seeds for mixed precision training.
| DGX A100 80GB, FP16, 8x GPU | Seed #1 | Seed #2 | Seed #3 | Seed #4 | Seed #5 | Seed #6 | Seed #7 | Seed #8 | Mean | Std |
|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-------:|------:|
| dev-clean | 3.57 | 3.48 | 3.54 | 3.48 | 3.47 | 3.69 | 3.51 | 3.59 | 3.54 | 0.07 |
| dev-other | 10.68 | 10.78 | 10.47 | 10.72 | 10.84 | 11.03 | 10.67 | 10.86 | 10.76 | 0.15 |
| test-clean | 3.70 | 3.82 | 3.79 | 3.84 | 3.69 | 4.03 | 3.82 | 3.80 | 3.81 | 0.10 |
| test-other | 10.75 | 10.62 | 10.54 | 10.90 | 10.69 | 11.14 | 10.41 | 10.82 | 10.73 | 0.21 |
#### Training performance results
##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running:
```bash
AMP=true NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="16 24" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="4 6" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="2 3" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="16 24" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="4 6" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="2 3" bash scripts/train_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
| Batch size / GPU | Grad accumulation | GPUs | Throughput - mixed precision | Throughput - TF32 | Throughput speedup (TF32 to mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|-----:|-----:|-------:|----------:|-------:|--------:|-----:|------:|
| 48 | 24 | 1 | 89.69 | 78.89 | 1.14 | 1.00 | 1.00 |
| 72 | 16 | 1 | 88.70 | 79.01 | 1.12 | 1.00 | 1.00 |
| 48 | 6 | 4 | 343.06 | 303.16 | 1.13 | 3.84 | 3.82 |
| 72 | 4 | 4 | 341.95 | 304.47 | 1.12 | 3.85 | 3.86 |
| 48 | 3 | 8 | 644.27 | 576.37 | 1.12 | 7.31 | 7.18 |
| 72 | 2 | 8 | 651.60 | 583.31 | 1.12 | 7.38 | 7.35 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
Our results were obtained by running:
```bash
AMP=true NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="24 48" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="6 12" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="3 6" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="16" GRAD_ACC_SEQ="3" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="48" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="12" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="6" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="16" GRAD_ACC_SEQ="3" bash scripts/train_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX-2 (16x V100 32GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
| Batch size / GPU | Grad accumulation | GPUs | Throughput - mixed precision | Throughput - FP32 | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|-----:|-----:|-------:|----------:|-------:|--------:|------:|------:|
| 24 | 48 | 1 | 67.95 | 44.65 | 1.52 | 1.00 | 1.00 |
| 48 | 24 | 1 | 67.49 | - | - | 1.00 | 1.00 |
| 24 | 12 | 4 | 258.56 | 170.18 | 1.52 | 3.81 | 3.81 |
| 48 | 6 | 4 | 254.58 | - | - | - | 3.77 |
| 24 | 6 | 8 | 495.52 | 330.53 | 1.50 | 7.40 | 7.29 |
| 48 | 3 | 8 | 477.87 | - | - | - | 7.08 |
| 24 | 3 | 16 | 872.99 | 616.51 | 1.42 | 13.81 | 12.85 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results
##### Inference performance: NVIDIA DGX A100 (1x A100 80GB)
Our results were obtained by running:
```bash
AMP=false bash scripts/inference_benchmark.sh
AMP=true bash scripts/inference_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (1x A100 80GB) GPU.
Performance numbers (latency in milliseconds per batch) were averaged over 500 iterations.
| | | FP16 Latency (ms) Percentiles | | | | TF32 Latency (ms) Percentiles | | | | FP16/TF32 speed up |
|-----:|---------------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
| 1 | 2.0 | 35.51 | 36.36 | 55.57 | 35.71 | 33.23 | 33.86 | 40.05 | 33.23 | 0.93 |
| 2 | 2.0 | 38.05 | 38.91 | 52.67 | 38.21 | 34.17 | 35.17 | 39.32 | 33.73 | 0.88 |
| 4 | 2.0 | 38.43 | 38.98 | 45.44 | 37.78 | 35.02 | 36.00 | 44.10 | 34.75 | 0.92 |
| 8 | 2.0 | 38.63 | 39.37 | 45.43 | 37.94 | 35.49 | 36.70 | 45.94 | 34.53 | 0.91 |
| 16 | 2.0 | 42.33 | 44.58 | 61.02 | 40.28 | 35.66 | 36.93 | 45.38 | 34.78 | 0.86 |
| 1 | 7.0 | 37.72 | 38.54 | 42.56 | 37.28 | 33.23 | 34.16 | 40.54 | 33.13 | 0.89 |
| 2 | 7.0 | 39.44 | 41.35 | 53.62 | 38.56 | 35.15 | 35.81 | 41.83 | 34.82 | 0.90 |
| 4 | 7.0 | 38.39 | 39.48 | 45.01 | 37.98 | 37.54 | 38.51 | 42.67 | 36.12 | 0.95 |
| 8 | 7.0 | 40.82 | 41.76 | 54.20 | 39.43 | 37.67 | 39.97 | 45.24 | 36.12 | 0.92 |
| 16 | 7.0 | 42.80 | 44.80 | 56.92 | 41.52 | 40.66 | 41.96 | 53.24 | 39.24 | 0.95 |
| 1 | 16.7 | 38.22 | 38.98 | 44.15 | 37.80 | 33.89 | 34.98 | 42.66 | 33.23 | 0.88 |
| 2 | 16.7 | 39.84 | 41.09 | 52.50 | 39.34 | 35.86 | 37.16 | 42.04 | 34.39 | 0.87 |
| 4 | 16.7 | 41.02 | 42.64 | 54.96 | 39.50 | 35.98 | 37.02 | 39.30 | 34.87 | 0.88 |
| 8 | 16.7 | 40.93 | 42.06 | 56.26 | 39.36 | 40.93 | 42.06 | 45.50 | 39.34 | 1.00 |
| 16 | 16.7 | 57.21 | 58.65 | 71.33 | 57.78 | 62.74 | 63.82 | 71.13 | 61.49 | 1.06 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
Our results were obtained by running:
```bash
AMP=false bash scripts/inference_benchmark.sh
AMP=true bash scripts/inference_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX-2 (1x V100 32GB) GPU.
Performance numbers (latency in milliseconds per batch) were averaged over 500 iterations.
| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
|-----:|---------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|------:|
| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
| 1 | 2.0 | 36.89 | 38.16 | 41.80 | 35.85 | 33.44 | 33.78 | 38.09 | 33.01 | 0.92 |
| 2 | 2.0 | 40.47 | 41.33 | 45.70 | 40.02 | 32.62 | 33.27 | 36.38 | 32.09 | 0.80 |
| 4 | 2.0 | 41.50 | 42.85 | 49.65 | 41.12 | 34.56 | 34.83 | 37.10 | 34.04 | 0.83 |
| 8 | 2.0 | 49.87 | 50.48 | 51.99 | 49.19 | 34.90 | 35.17 | 36.57 | 34.27 | 0.70 |
| 16 | 2.0 | 46.39 | 46.77 | 47.87 | 40.04 | 45.37 | 45.89 | 47.52 | 44.46 | 1.11 |
| 1 | 7.0 | 48.83 | 49.16 | 52.22 | 48.26 | 33.87 | 34.50 | 36.45 | 33.24 | 0.69 |
| 2 | 7.0 | 41.48 | 41.82 | 45.07 | 41.03 | 42.32 | 42.66 | 43.86 | 41.79 | 1.02 |
| 4 | 7.0 | 42.48 | 43.25 | 47.29 | 41.56 | 37.20 | 38.18 | 39.74 | 36.46 | 0.88 |
| 8 | 7.0 | 39.78 | 40.49 | 44.73 | 38.89 | 46.84 | 47.17 | 48.07 | 44.78 | 1.15 |
| 16 | 7.0 | 49.85 | 50.56 | 53.04 | 44.95 | 60.21 | 60.68 | 64.92 | 57.94 | 1.29 |
| 1 | 16.7 | 40.80 | 41.16 | 42.96 | 40.52 | 42.04 | 42.53 | 44.59 | 37.08 | 0.92 |
| 2 | 16.7 | 41.37 | 41.69 | 43.74 | 40.85 | 35.61 | 36.49 | 40.32 | 34.68 | 0.85 |
| 4 | 16.7 | 50.22 | 51.07 | 54.13 | 49.51 | 40.95 | 41.38 | 44.09 | 40.39 | 0.82 |
| 8 | 16.7 | 44.93 | 45.38 | 49.24 | 44.16 | 62.54 | 62.92 | 65.95 | 61.86 | 1.40 |
| 16 | 16.7 | 70.74 | 71.56 | 75.16 | 69.87 | 102.52 | 103.57 | 108.20 | 101.57 | 1.45 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
We're constantly refining and improving our performance on AI and HPC workloads, even on the same hardware, with frequent updates to our software stack. For our latest performance data, refer to these pages for [AI](https://developer.nvidia.com/deep-learning-performance-training-inference) and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
### Changelog
September 2021
- Initial release
### Known issues
There are no known issues in this release.


@@ -0,0 +1,247 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import soundfile as sf
import librosa
import torch
import numpy as np
import sox
def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000):
audio = AudioSegment(file_path, target_sr=target_sr, int_values=False,
offset=offset, duration=duration, trim=trim)
samples = torch.tensor(audio.samples, dtype=torch.float).cuda()
num_samples = torch.tensor(samples.shape[0]).int().cuda()
return (samples.unsqueeze(0), num_samples.unsqueeze(0))
class AudioSegment(object):
"""Monaural audio segment abstraction.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:raises TypeError: If the sample data type is not float or int.
"""
def __init__(self, filename, target_sr=None, int_values=False, offset=0,
duration=0, trim=False, trim_db=60):
"""Create audio segment from samples.
Samples are converted to float32 internally, with int scaled to [-1, 1].
Load a file supported by librosa and return as an AudioSegment.
:param filename: path of file to load
:param target_sr: the desired sample rate
:param int_values: if true, load samples as 32-bit integers
:param offset: offset in seconds when loading audio
:param duration: duration in seconds when loading audio
:return: numpy array of samples
"""
with sf.SoundFile(filename, 'r') as f:
dtype = 'int32' if int_values else 'float32'
sample_rate = f.samplerate
if offset > 0:
f.seek(int(offset * sample_rate))
if duration > 0:
samples = f.read(int(duration * sample_rate), dtype=dtype)
else:
samples = f.read(dtype=dtype)
samples = samples.transpose()
samples = self._convert_samples_to_float32(samples)
if target_sr is not None and target_sr != sample_rate:
samples = librosa.core.resample(samples, sample_rate, target_sr)
sample_rate = target_sr
if trim:
samples, _ = librosa.effects.trim(samples, trim_db)
self._samples = samples
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
self._samples = np.mean(self._samples, 1)
def __eq__(self, other):
"""Return whether two objects are equal."""
if type(other) is not type(self):
return False
if self._sample_rate != other._sample_rate:
return False
if self._samples.shape != other._samples.shape:
return False
if np.any(self.samples != other._samples):
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
def __str__(self):
"""Return human-readable representation of segment."""
return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
"rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
self.duration, self.rms_db))
@staticmethod
def _convert_samples_to_float32(samples):
"""Convert sample type to float32.
Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2 ** (bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
@property
def samples(self):
return self._samples.copy()
@property
def sample_rate(self):
return self._sample_rate
@property
def num_samples(self):
return self._samples.shape[0]
@property
def duration(self):
return self._samples.shape[0] / float(self._sample_rate)
@property
def rms_db(self):
mean_square = np.mean(self._samples ** 2)
return 10 * np.log10(mean_square)
def gain_db(self, gain):
self._samples *= 10. ** (gain / 20.)
def pad(self, pad_size, symmetric=False):
"""Add zero padding to the sample.
The pad size is given in number of samples. If symmetric=True,
`pad_size` will be added to both sides. If false, `pad_size` zeros
will be added only to the end.
"""
self._samples = np.pad(self._samples,
(pad_size if symmetric else 0, pad_size),
mode='constant')
def subsegment(self, start_time=None, end_time=None):
"""Cut the AudioSegment between given boundaries.
Note that this is an in-place transformation.
:param start_time: Beginning of subsegment in seconds.
:type start_time: float
:param end_time: End of subsegment in seconds.
:type end_time: float
:raise ValueError: If start_time or end_time is incorrectly set, e.g. out
of bounds in time.
"""
start_time = 0.0 if start_time is None else start_time
end_time = self.duration if end_time is None else end_time
if start_time < 0.0:
start_time = self.duration + start_time
if end_time < 0.0:
end_time = self.duration + end_time
if start_time < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start_time)
if end_time < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end_time)
if start_time > end_time:
raise ValueError("The slice start position (%f s) is later than "
"the end position (%f s)." % (start_time, end_time))
if end_time > self.duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end_time, self.duration))
start_sample = int(round(start_time * self._sample_rate))
end_sample = int(round(end_time * self._sample_rate))
self._samples = self._samples[start_sample:end_sample]
class Perturbation:
def __init__(self, p=0.1, rng=None):
self.p = p
self._rng = random.Random() if rng is None else rng
def maybe_apply(self, segment, sample_rate=None):
if self._rng.random() < self.p:
self(segment, sample_rate)
class SpeedPerturbation(Perturbation):
def __init__(self, min_rate=0.85, max_rate=1.15, discrete=False, p=0.1, rng=None):
super(SpeedPerturbation, self).__init__(p, rng)
assert 0 < min_rate < max_rate
self.min_rate = min_rate
self.max_rate = max_rate
self.discrete = discrete
def __call__(self, data, sample_rate):
if self.discrete:
rate = np.random.choice([self.min_rate, None, self.max_rate])
else:
rate = self._rng.uniform(self.min_rate, self.max_rate)
if rate is not None:
data._samples = sox.Transformer().speed(factor=rate).build_array(
input_array=data._samples, sample_rate_in=sample_rate)
class GainPerturbation(Perturbation):
def __init__(self, min_gain_dbfs=-10, max_gain_dbfs=10, p=0.1, rng=None):
super(GainPerturbation, self).__init__(p, rng)
self._rng = random.Random() if rng is None else rng
self._min_gain_dbfs = min_gain_dbfs
self._max_gain_dbfs = max_gain_dbfs
def __call__(self, data, sample_rate=None):
del sample_rate
gain = self._rng.uniform(self._min_gain_dbfs, self._max_gain_dbfs)
data._samples = data._samples * (10. ** (gain / 20.))
class ShiftPerturbation(Perturbation):
def __init__(self, min_shift_ms=-5.0, max_shift_ms=5.0, p=0.1, rng=None):
super(ShiftPerturbation, self).__init__(p, rng)
self._min_shift_ms = min_shift_ms
self._max_shift_ms = max_shift_ms
def __call__(self, data, sample_rate):
shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
if abs(shift_ms) / 1000 > data.duration:
# TODO: do something smarter than just ignore this condition
return
shift_samples = int(shift_ms * data.sample_rate // 1000)
# print("DEBUG: shift:", shift_samples)
if shift_samples < 0:
data._samples[-shift_samples:] = data._samples[:shift_samples]
data._samples[:-shift_samples] = 0
elif shift_samples > 0:
data._samples[:-shift_samples] = data._samples[shift_samples:]
data._samples[-shift_samples:] = 0
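

# Illustrative sketch only (not part of the released pipeline): shows how the
# classes above are typically wired together. The file path below is hypothetical.
def _example_augment_audio(wav_path="sample.wav"):
    """Load a waveform, apply the perturbations defined above, return a tensor."""
    segment = AudioSegment(wav_path, target_sr=16000, trim=False)
    perturbations = [
        SpeedPerturbation(min_rate=0.85, max_rate=1.15, p=1.0),
        GainPerturbation(min_gain_dbfs=-10, max_gain_dbfs=10, p=1.0),
        ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0, p=1.0),
    ]
    for p in perturbations:
        # maybe_apply draws from the perturbation's RNG and applies it with probability p
        p.maybe_apply(segment, segment.sample_rate)
    return torch.FloatTensor(segment.samples)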

View file

@ -0,0 +1,182 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import math
import os
import torch
import torch.distributed as dist
from .iterator import DaliIterator, SyntheticDataIterator
from .pipeline import make_dali_asr_pipeline
from common.helpers import print_once
def _parse_json(json_path: str, start_label=0, predicate=lambda json: True):
"""
Parses json file to the format required by DALI.
Args:
json_path: path to json file
start_label: the label, starting from which DALI will assign
consecutive int numbers to every transcript
predicate: function, that accepts a sample descriptor
(i.e. json dictionary) as an argument. If the predicate for a given
sample returns True, it will be included in the dataset.
Returns:
output_files: dict that maps file name to label assigned by DALI
transcripts: dict that maps label assigned by DALI to the transcript
"""
with open(json_path) as f:
librispeech_json = json.load(f)
output_files = {}
transcripts = {}
curr_label = start_label
for original_sample in librispeech_json:
if not predicate(original_sample):
continue
transcripts[curr_label] = original_sample['transcript']
output_files[original_sample['files'][-1]['fname']] = curr_label
curr_label += 1
return output_files, transcripts
def _dict_to_file(d: dict, filename: str):
    with open(filename, "w") as f:
        for key, value in d.items():
            f.write("{} {}\n".format(key, value))
class DaliDataLoader:
"""
DataLoader is the main entry point to the data preprocessing pipeline.
    To use, create an object and then iterate over `data_iterator()`;
    DataLoader will do the rest for you.
    Example:
        data_layer = DaliDataLoader(gpu_id, dataset_path, config_data,
                                    config_features, json_names, symbols,
                                    batch_size, pipeline_type="train")
        for data in data_layer.data_iterator():
            print(data)  # Here's your preprocessed data
Args:
device_type: Which device to use for preprocessing. Choose: "cpu", "gpu"
pipeline_type: Choose: "train", "val", "synth"
"""
def __init__(self, gpu_id, dataset_path: str, config_data: dict,
config_features: dict, json_names: list, symbols: list,
batch_size: int, pipeline_type: str,
grad_accumulation_steps: int = 1,
synth_iters_per_epoch: int = 544, device_type: str = "gpu"):
self.batch_size = batch_size
self.grad_accumulation_steps = grad_accumulation_steps
self.drop_last = (pipeline_type == 'train')
self.device_type = device_type
pipeline_type = self._parse_pipeline_type(pipeline_type)
if pipeline_type == "synth":
self._dali_data_iterator = self._init_synth_iterator(
self.batch_size,
config_features['nfilt'],
iters_per_epoch=synth_iters_per_epoch,
ngpus=torch.distributed.get_world_size())
else:
self._dali_data_iterator = self._init_iterator(
gpu_id=gpu_id,
dataset_path=dataset_path,
config_data=config_data,
config_features=config_features,
json_names=json_names,
symbols=symbols,
train_pipeline=pipeline_type == "train")
def _init_iterator(self, gpu_id, dataset_path, config_data,
config_features, json_names: list, symbols: list,
train_pipeline: bool):
"""Returns an iterator over data preprocessed with Dali."""
def hash_list_of_strings(li):
return str(abs(hash(''.join(li))))
output_files, transcripts = {}, {}
max_duration = config_data['max_duration']
for jname in json_names:
of, tr = _parse_json(
jname if jname[0] == '/' else os.path.join(dataset_path, jname),
len(output_files),
predicate=lambda json: json['original_duration'] <= max_duration)
output_files.update(of)
transcripts.update(tr)
file_list_path = os.path.join(
"/tmp", "asr_dali.file_list." + hash_list_of_strings(json_names))
_dict_to_file(output_files, file_list_path)
self.dataset_size = len(output_files)
print_once('Dataset read by DALI. '
f'Number of samples: {self.dataset_size}')
pipeline = make_dali_asr_pipeline(
config_data=config_data,
config_features=config_features,
device_id=gpu_id,
file_root=dataset_path,
file_list=file_list_path,
device_type=self.device_type,
batch_size=self.batch_size,
train_pipeline=train_pipeline)
return DaliIterator([pipeline], transcripts=transcripts,
symbols=symbols, batch_size=self.batch_size,
reader_name="file_reader",
train_iterator=train_pipeline)
def _init_synth_iterator(self, batch_size, nfeatures, iters_per_epoch,
ngpus):
self.dataset_size = ngpus * iters_per_epoch * batch_size
return SyntheticDataIterator(batch_size, nfeatures, regenerate=True)
@staticmethod
def _parse_pipeline_type(pipeline_type):
pipe = pipeline_type.lower()
assert pipe in ("train", "val", "synth"), \
'Invalid pipeline type (choices: "train", "val", "synth").'
return pipe
def _shard_size(self):
"""
Total number of samples handled by a single GPU in a single epoch.
"""
world_size = dist.get_world_size() if dist.is_initialized() else 1
if self.drop_last:
divisor = world_size * self.batch_size * self.grad_accumulation_steps
return self.dataset_size // divisor * divisor // world_size
else:
return int(math.ceil(self.dataset_size / world_size))
def __len__(self):
"""
Number of batches handled by each GPU.
"""
if self.drop_last:
assert self._shard_size() % self.batch_size == 0, \
f'{self._shard_size()} {self.batch_size}'
return int(math.ceil(self._shard_size() / self.batch_size))
def data_iterator(self):
return self._dali_data_iterator
def __iter__(self):
return self._dali_data_iterator
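

# Worked example (illustration only) of the sharding arithmetic implemented in
# _shard_size() and __len__() above; all the numbers below are hypothetical.
def _example_shard_arithmetic():
    dataset_size = 10_000
    world_size = 8
    batch_size = 16
    grad_accumulation_steps = 2
    # drop_last=True (training): round dataset_size down to a multiple of the
    # global batch (world_size * batch_size * grad_accumulation_steps) and
    # split it evenly across GPUs.
    divisor = world_size * batch_size * grad_accumulation_steps  # 256
    shard_size = dataset_size // divisor * divisor // world_size  # 9984 // 8 = 1248
    # each GPU then sees shard_size / batch_size batches per epoch
    batches_per_gpu = shard_size // batch_size  # 78
    return shard_size, batches_per_gpu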

View file

@ -0,0 +1,183 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch
from nvidia.dali.plugin.base_iterator import LastBatchPolicy
from nvidia.dali.plugin.pytorch import DALIGenericIterator
from common.helpers import print_once
from common.text import _clean_text, punctuation_map
def normalize_string(s, symbols, punct_map):
"""
Normalizes string.
Example:
'call me at 8:00 pm!' -> 'call me at eight zero pm'
"""
labels = set(symbols)
try:
text = _clean_text(s, ["english_cleaners"], punct_map).strip()
return ''.join([tok for tok in text if all(t in labels for t in tok)])
except Exception as e:
print_once(f"WARNING: Normalizing failed: {s} {e}")
class DaliIterator(object):
"""Returns batches of data.
Batches are in the form:
(preprocessed_signal, preprocessed_signal_length, transcript,
transcript_length)
This iterator is not meant to be the entry point to a Dali pipeline.
Use DataLoader instead.
"""
def __init__(self, dali_pipelines, transcripts, symbols, batch_size,
reader_name, train_iterator: bool):
self.transcripts = transcripts
self.symbols = symbols
self.batch_size = batch_size
        # in the train pipeline shard_size is set to be divisible by batch_size,
        # so the DROP last-batch policy below does not discard any samples
self.dali_it = DALIGenericIterator(
dali_pipelines,
["audio", "label", "audio_shape"],
reader_name=reader_name,
dynamic_shape=True,
auto_reset=True,
last_batch_policy=LastBatchPolicy.DROP)
@staticmethod
def _str2list(s: str):
"""
Returns list of floats, that represents given string.
'0.' denotes separator
'1.' denotes 'a'
'27.' denotes "'"
Assumes, that the string is lower case.
"""
list = []
for c in s:
if c == "'":
list.append(27.)
else:
list.append(max(0., ord(c) - 96.))
return list
@staticmethod
def _pad_lists(lists: list, pad_val=0):
"""
        Pads lists in place so that they all have the same length.
        Returns a list with the original lengths of the input lists.
"""
max_length = 0
sizes = []
for li in lists:
sizes.append(len(li))
max_length = max_length if len(li) < max_length else len(li)
for li in lists:
li += [pad_val] * (max_length - len(li))
return sizes
def _gen_transcripts(self, labels, normalize_transcripts: bool = True):
"""
Generate transcripts in format expected by NN
"""
if normalize_transcripts:
lists = [
self._str2list(normalize_string(self.transcripts[lab.item()],
self.symbols, punctuation_map(self.symbols)))
for lab in labels]
else:
lists = [self._str2list(self.transcripts[lab.item()])
for lab in labels]
sizes = self._pad_lists(lists)
return (torch.tensor(lists).cuda(),
torch.tensor(sizes, dtype=torch.int32).cuda())
def __next__(self):
data = self.dali_it.__next__()
transcripts, transcripts_lengths = self._gen_transcripts(
data[0]["label"])
return (data[0]["audio"], data[0]["audio_shape"][:, 1], transcripts,
transcripts_lengths)
def next(self):
return self.__next__()
def __iter__(self):
return self
# TODO: refactor
class SyntheticDataIterator(object):
def __init__(self, batch_size, nfeatures, feat_min=-5., feat_max=0.,
txt_min=0., txt_max=23., feat_lens_max=1760, txt_lens_max=231,
regenerate=False):
"""
Args:
batch_size
nfeatures: number of features for melfbanks
feat_min: minimum value in `feat` tensor, used for randomization
feat_max: maximum value in `feat` tensor, used for randomization
txt_min: minimum value in `txt` tensor, used for randomization
txt_max: maximum value in `txt` tensor, used for randomization
regenerate: If True, regenerate random tensors for every iterator
step. If False, generate them only at start.
"""
self.batch_size = batch_size
self.nfeatures = nfeatures
self.feat_min = feat_min
self.feat_max = feat_max
self.feat_lens_max = feat_lens_max
self.txt_min = txt_min
self.txt_max = txt_max
self.txt_lens_max = txt_lens_max
self.regenerate = regenerate
if not self.regenerate:
(self.feat, self.feat_lens, self.txt, self.txt_lens
) = self._generate_sample()
def _generate_sample(self):
feat = ((self.feat_max - self.feat_min)
* np.random.random_sample(
(self.batch_size, self.nfeatures, self.feat_lens_max))
+ self.feat_min)
feat_lens = np.random.randint(0, int(self.feat_lens_max) - 1,
size=self.batch_size)
txt = (self.txt_max - self.txt_min) * np.random.random_sample(
(self.batch_size, self.txt_lens_max)) + self.txt_min
txt_lens = np.random.randint(0, int(self.txt_lens_max) - 1,
size=self.batch_size)
return (torch.Tensor(feat).cuda(),
torch.Tensor(feat_lens).cuda(),
torch.Tensor(txt).cuda(),
torch.Tensor(txt_lens).cuda())
def __next__(self):
if self.regenerate:
return self._generate_sample()
return self.feat, self.feat_lens, self.txt, self.txt_lens
def next(self):
return self.__next__()
def __iter__(self):
return self
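

# Illustration only: how the static helpers above encode transcripts before
# they are sent to the GPU. Not used by the pipeline itself.
def _example_encode_transcripts():
    transcripts = ["cat", "a dog"]
    lists = [DaliIterator._str2list(t) for t in transcripts]
    # 'cat'   -> [3., 1., 20.]
    # 'a dog' -> [1., 0., 4., 15., 7.]   (0. encodes the space separator)
    sizes = DaliIterator._pad_lists(lists)  # pads in place to the longest list
    # lists -> [[3., 1., 20., 0., 0.], [1., 0., 4., 15., 7.]], sizes -> [3, 5]
    return torch.tensor(lists), torch.tensor(sizes, dtype=torch.int32)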

View file

@ -0,0 +1,343 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import math
import multiprocessing
import numpy as np
import nvidia.dali as dali
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import torch
import torch.distributed as dist
def _interleave_lists(*lists):
"""
[*, **, ***], [1, 2, 3], [a, b, c] -> [*, 1, a, **, 2, b, ***, 3, c]
Returns:
iterator over interleaved list
"""
assert all((len(lists[0]) == len(test_l) for test_l in lists)), \
"All lists have to have the same length"
return itertools.chain(*zip(*lists))
def _generate_cutouts(mask_params, nfeatures):
"""
    Generates anchors and shapes of the cutout regions.
    A single call generates one batch of data.
    The output is meant to be passed to DALI's Erase operator.
    Returns:
        anchors = [f0 t0 f1 t1 ...]
        shapes = [f0w t0h f1w t1h ...]
"""
MAX_TIME_DIMENSION = 20 * 16000
freq_anchors = np.random.random(mask_params['freq_num_regions'])
time_anchors = np.random.random(mask_params['time_num_regions'])
both_anchors_freq = np.random.random(mask_params['both_num_regions'])
both_anchors_time = np.random.random(mask_params['both_num_regions'])
anchors = []
for anch in freq_anchors:
anchors.extend([anch, 0])
for anch in time_anchors:
anchors.extend([0, anch])
for t, f in zip(both_anchors_time, both_anchors_freq):
anchors.extend([f, t])
shapes = []
shapes.extend(
_interleave_lists(
np.random.randint(mask_params['freq_min'],
mask_params['freq_max'] + 1,
mask_params['freq_num_regions']),
            # XXX: The actual time dimension of the spectrogram should be passed here.
            # However, in DALI an ArgumentInput cannot come from the GPU,
            # so we leave it to Erase (the masking operator) to handle the oversized region.
[int(MAX_TIME_DIMENSION)] * mask_params['freq_num_regions']
)
)
shapes.extend(
_interleave_lists(
[nfeatures] * mask_params['time_num_regions'],
np.random.randint(mask_params['time_min'],
mask_params['time_max'] + 1,
mask_params['time_num_regions'])
)
)
shapes.extend(
_interleave_lists(
np.random.randint(mask_params['both_min_freq'],
mask_params['both_max_freq'] + 1,
mask_params['both_num_regions']),
np.random.randint(mask_params['both_min_time'],
mask_params['both_max_time'] + 1,
mask_params['both_num_regions'])
)
)
return anchors, shapes
def _tuples2list(tuples: list):
"""
[(a, b), (c, d)] -> [[a, c], [b, d]]
"""
return map(list, zip(*tuples))
def _dali_init_log(args: dict):
if not dist.is_initialized() or dist.get_rank() == 0:
max_len = max([len(ii) for ii in args.keys()])
fmt_string = '\t%' + str(max_len) + 's : %s'
print('Initializing DALI with parameters:')
for keyPair in sorted(args.items()):
print(fmt_string % keyPair)
@dali.pipeline_def
def dali_asr_pipeline(train_pipeline, # True if training, False if validation
file_root,
file_list,
sample_rate,
silence_threshold,
resample_range,
discrete_resample_range,
window_size,
window_stride,
nfeatures,
nfft,
frame_splicing_factor,
dither_coeff,
pad_align,
preemph_coeff,
do_spectrogram_masking=False,
cutouts_generator=None,
shard_id=0,
n_shards=1,
preprocessing_device="gpu"):
do_remove_silence = silence_threshold is not None
def _div_ceil(dividend, divisor):
return (dividend + (divisor - 1)) // divisor
encoded, label = fn.readers.file(
device="cpu", name="file_reader", file_root=file_root,
file_list=file_list, shard_id=shard_id, num_shards=n_shards,
shuffle_after_epoch=train_pipeline)
speed_perturbation_coeffs = None
if resample_range is not None:
if discrete_resample_range:
values = [resample_range[0], 1.0, resample_range[1]]
speed_perturbation_coeffs = fn.random.uniform(device="cpu",
values=values)
else:
speed_perturbation_coeffs = fn.random.uniform(device="cpu",
range=resample_range)
if train_pipeline and speed_perturbation_coeffs is not None:
dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
elif resample_range is None:
dec_sample_rate_arg = sample_rate
else:
dec_sample_rate_arg = None
audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
dtype=types.FLOAT, downmix=True)
if do_remove_silence:
begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
audio = fn.slice(audio, begin, length, axes=[0])
# Max duration drop is performed at DataLayer stage
if preprocessing_device == "gpu":
audio = audio.gpu()
if dither_coeff != 0.:
audio = audio + fn.random.normal(device=preprocessing_device
) * dither_coeff
audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)
spec = fn.spectrogram(audio, nfft=nfft,
window_length=window_size * sample_rate,
window_step=window_stride * sample_rate)
mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
nfilter=nfeatures, normalize=True)
log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
reference=1.0, cutoff_db=math.log(1e-20))
log_features_len = fn.shapes(log_features)
if frame_splicing_factor != 1:
log_features_len = _div_ceil(log_features_len, frame_splicing_factor)
log_features = fn.normalize(log_features, axes=[1])
log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)
if train_pipeline and do_spectrogram_masking:
anchors, shapes = fn.external_source(source=cutouts_generator,
num_outputs=2, cycle=True)
log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
axes=[0, 1], fill_value=0,
normalized_anchor=True)
# When modifying DALI pipeline returns, make sure you update `output_map`
# in DALIGenericIterator invocation
return log_features.gpu(), label.gpu(), log_features_len.gpu()
def make_dali_asr_pipeline(train_pipeline: bool, device_id, batch_size,
file_root: str, file_list: str, config_data: dict,
config_features: dict, device_type: str = "gpu",
do_resampling: bool = True,
num_cpu_threads: int = multiprocessing.cpu_count()):
max_duration = config_data['max_duration']
sample_rate = config_data['sample_rate']
silence_threshold = -60 if config_data['trim_silence'] else None
    # TODO Take into account resampling probability
# TODO config_features['speed_perturbation']['p']
if do_resampling and config_data['speed_perturbation'] is not None:
resample_range = [config_data['speed_perturbation']['min_rate'],
config_data['speed_perturbation']['max_rate']]
discrete_resample_range = config_data['speed_perturbation']['discrete']
else:
resample_range = None
discrete_resample_range = False
window_size = config_features['window_size']
window_stride = config_features['window_stride']
nfeatures = config_features['n_filt']
nfft = config_features['n_fft']
frame_splicing_factor = config_features['frame_splicing']
dither_coeff = config_features['dither']
pad_align = config_features['pad_align']
pad_to_max_duration = config_features['pad_to_max_duration']
assert not pad_to_max_duration, \
"Padding to max duration currently not supported in DALI"
preemph_coeff = .97
config_spec = config_features['spec_augment']
if config_spec is not None:
mask_time_num_regions = config_spec['time_masks']
mask_time_min = config_spec['min_time']
mask_time_max = config_spec['max_time']
mask_freq_num_regions = config_spec['freq_masks']
mask_freq_min = config_spec['min_freq']
mask_freq_max = config_spec['max_freq']
else:
mask_time_num_regions = 0
mask_time_min = 0
mask_time_max = 0
mask_freq_num_regions = 0
mask_freq_min = 0
mask_freq_max = 0
config_cutout = config_features['cutout_augment']
if config_cutout is not None:
mask_both_num_regions = config_cutout['masks']
mask_both_min_time = config_cutout['min_time']
mask_both_max_time = config_cutout['max_time']
mask_both_min_freq = config_cutout['min_freq']
mask_both_max_freq = config_cutout['max_freq']
else:
mask_both_num_regions = 0
mask_both_min_time = 0
mask_both_max_time = 0
mask_both_min_freq = 0
mask_both_max_freq = 0
nfeatures = config_features['n_filt']
do_spectrogram_masking = \
mask_time_num_regions > 0 or mask_freq_num_regions > 0 or \
mask_both_num_regions > 0
do_remove_silence = silence_threshold is not None
del(config_spec)
del(config_cutout)
del(config_data)
del(config_features)
_dali_init_log(locals())
mask_params = {
'time_num_regions': mask_time_num_regions,
'time_min': mask_time_min,
'time_max': mask_time_max,
'freq_num_regions': mask_freq_num_regions,
'freq_min': mask_freq_min,
'freq_max': mask_freq_max,
'both_num_regions': mask_both_num_regions,
'both_min_time': mask_both_min_time,
'both_max_time': mask_both_max_time,
'both_min_freq': mask_both_min_freq,
'both_max_freq': mask_both_max_freq,
}
def _cutouts_generator():
"""
        Generator that wraps cutout creation in order to randomize inputs
        and allow passing them to DALI's ExternalSource operator.
"""
[anchors, shapes] = _tuples2list(
[_generate_cutouts(mask_params, nfeatures)
for _ in range(batch_size)])
yield (np.array(anchors, dtype=np.float32),
np.array(shapes, dtype=np.float32))
cutouts_gen = _cutouts_generator if do_spectrogram_masking else None
if torch.distributed.is_initialized():
shard_id = torch.distributed.get_rank()
n_shards = torch.distributed.get_world_size()
else:
shard_id = 0
n_shards = 1
preprocessing_device = device_type.lower()
assert preprocessing_device == "cpu" or preprocessing_device == "gpu", \
"Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
pipe = dali_asr_pipeline(
train_pipeline=train_pipeline,
file_root=file_root,
file_list=file_list,
sample_rate=sample_rate,
silence_threshold=silence_threshold,
resample_range=resample_range,
discrete_resample_range=discrete_resample_range,
window_size=window_size,
window_stride=window_stride,
nfeatures=nfeatures,
nfft=nfft,
frame_splicing_factor=frame_splicing_factor,
dither_coeff=dither_coeff,
pad_align=pad_align,
preemph_coeff=preemph_coeff,
do_spectrogram_masking=do_spectrogram_masking,
cutouts_generator=cutouts_gen,
shard_id=shard_id,
n_shards=n_shards,
preprocessing_device=preprocessing_device,
batch_size=batch_size,
num_threads=num_cpu_threads,
device_id=device_id
)
return pipe
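

# Minimal, illustrative configuration for building a validation pipeline with
# make_dali_asr_pipeline(). Paths and values below are hypothetical; the real
# configs come from the model configuration files.
def _example_build_pipeline(device_id=0):
    config_data = {
        'max_duration': 16.7,
        'sample_rate': 16000,
        'trim_silence': True,
        'speed_perturbation': None,   # no on-the-fly resampling
    }
    config_features = {
        'window_size': 0.02,
        'window_stride': 0.01,
        'n_filt': 64,
        'n_fft': 512,
        'frame_splicing': 1,
        'dither': 1e-5,
        'pad_align': 16,
        'pad_to_max_duration': False,
        'spec_augment': None,         # masking disabled for validation
        'cutout_augment': None,
    }
    return make_dali_asr_pipeline(
        train_pipeline=False,
        device_id=device_id,
        batch_size=16,
        file_root="/datasets/LibriSpeech",        # hypothetical path
        file_list="/tmp/asr_dali.file_list.0",    # normally produced by DaliDataLoader
        config_data=config_data,
        config_features=config_features,
        device_type="gpu")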

View file

@ -0,0 +1,234 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from .audio import (audio_from_file, AudioSegment, GainPerturbation,
ShiftPerturbation, SpeedPerturbation)
from .text import _clean_text, punctuation_map
def normalize_string(s, labels, punct_map):
"""Normalizes string.
Example:
'call me at 8:00 pm!' -> 'call me at eight zero pm'
"""
labels = set(labels)
try:
text = _clean_text(s, ["english_cleaners"], punct_map).strip()
return ''.join([tok for tok in text if all(t in labels for t in tok)])
    except Exception as e:
        print(f"WARNING: Normalizing failed: {s} {e}")
        return None
class FilelistDataset(Dataset):
def __init__(self, filelist_fpath):
self.samples = [line.strip() for line in open(filelist_fpath, 'r')]
def __len__(self):
return len(self.samples)
def __getitem__(self, index):
audio, audio_len = audio_from_file(self.samples[index])
return (audio.squeeze(0), audio_len, torch.LongTensor([0]),
torch.LongTensor([0]))
class SingleAudioDataset(FilelistDataset):
def __init__(self, audio_fpath):
self.samples = [audio_fpath]
class AudioDataset(Dataset):
def __init__(self, data_dir, manifest_fpaths, labels,
sample_rate=16000, min_duration=0.1, max_duration=float("inf"),
pad_to_max_duration=False, max_utts=0, normalize_transcripts=True,
sort_by_duration=False, trim_silence=False,
speed_perturbation=None, gain_perturbation=None,
shift_perturbation=None, ignore_offline_speed_perturbation=False):
"""Loads audio, transcript and durations listed in a .json file.
Args:
data_dir: absolute path to dataset folder
            manifest_fpaths: list of paths to manifest json files
                as described above
labels (str): all possible output symbols
min_duration (int): skip audio shorter than threshold
max_duration (int): skip audio longer than threshold
pad_to_max_duration (bool): pad all sequences to max_duration
max_utts (int): limit number of utterances
normalize_transcripts (bool): normalize transcript text
sort_by_duration (bool): sort sequences by increasing duration
trim_silence (bool): trim leading and trailing silence from audio
            ignore_offline_speed_perturbation (bool): discard offline (precomputed)
                speed-perturbed copies and use only the original-speed audio
Returns:
tuple of Tensors
"""
self.data_dir = data_dir
self.labels = labels
self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
self.punctuation_map = punctuation_map(labels)
self.blank_index = len(labels)
self.pad_to_max_duration = pad_to_max_duration
self.sort_by_duration = sort_by_duration
self.max_utts = max_utts
self.normalize_transcripts = normalize_transcripts
self.ignore_offline_speed_perturbation = ignore_offline_speed_perturbation
self.min_duration = min_duration
self.max_duration = max_duration
self.trim_silence = trim_silence
self.sample_rate = sample_rate
perturbations = []
if speed_perturbation is not None:
perturbations.append(SpeedPerturbation(**speed_perturbation))
if gain_perturbation is not None:
perturbations.append(GainPerturbation(**gain_perturbation))
if shift_perturbation is not None:
perturbations.append(ShiftPerturbation(**shift_perturbation))
self.perturbations = perturbations
self.max_duration = max_duration
self.samples = []
self.duration = 0.0
self.duration_filtered = 0.0
for fpath in manifest_fpaths:
self._load_json_manifest(fpath)
if sort_by_duration:
self.samples = sorted(self.samples, key=lambda s: s['duration'])
def __getitem__(self, index):
s = self.samples[index]
rn_indx = np.random.randint(len(s['audio_filepath']))
duration = s['audio_duration'][rn_indx] if 'audio_duration' in s else 0
offset = s.get('offset', 0)
segment = AudioSegment(
s['audio_filepath'][rn_indx], target_sr=self.sample_rate,
offset=offset, duration=duration, trim=self.trim_silence)
for p in self.perturbations:
p.maybe_apply(segment, self.sample_rate)
segment = torch.FloatTensor(segment.samples)
return (segment,
torch.tensor(segment.shape[0]).int(),
torch.tensor(s["transcript"]),
torch.tensor(len(s["transcript"])).int())
def __len__(self):
return len(self.samples)
def _load_json_manifest(self, fpath):
for s in json.load(open(fpath, "r", encoding="utf-8")):
if self.pad_to_max_duration and not self.ignore_offline_speed_perturbation:
# require all perturbed samples to be < self.max_duration
s_max_duration = max(f['duration'] for f in s['files'])
else:
                # otherwise we allow perturbed samples to be > self.max_duration
s_max_duration = s['original_duration']
s['duration'] = s.pop('original_duration')
if not (self.min_duration <= s_max_duration <= self.max_duration):
self.duration_filtered += s['duration']
continue
# Prune and normalize according to transcript
tr = (s.get('transcript', None) or
self.load_transcript(s['text_filepath']))
if not isinstance(tr, str):
print(f'WARNING: Skipped sample (transcript not a str): {tr}.')
self.duration_filtered += s['duration']
continue
if self.normalize_transcripts:
tr = normalize_string(tr, self.labels, self.punctuation_map)
s["transcript"] = self.to_vocab_inds(tr)
files = s.pop('files')
if self.ignore_offline_speed_perturbation:
files = [f for f in files if f['speed'] == 1.0]
s['audio_duration'] = [f['duration'] for f in files]
s['audio_filepath'] = [str(Path(self.data_dir, f['fname']))
for f in files]
self.samples.append(s)
self.duration += s['duration']
if self.max_utts > 0 and len(self.samples) >= self.max_utts:
print(f'Reached max_utts={self.max_utts}. Finished parsing {fpath}.')
break
def load_transcript(self, transcript_path):
with open(transcript_path, 'r', encoding="utf-8") as transcript_file:
transcript = transcript_file.read().replace('\n', '')
return transcript
def to_vocab_inds(self, transcript):
chars = [self.labels_map.get(x, self.blank_index) for x in list(transcript)]
transcript = list(filter(lambda x: x != self.blank_index, chars))
return transcript
def collate_fn(batch):
bs = len(batch)
max_len = lambda l, idx: max(el[idx].size(0) for el in l)
audio = torch.zeros(bs, max_len(batch, 0))
audio_lens = torch.zeros(bs, dtype=torch.int32)
transcript = torch.zeros(bs, max_len(batch, 2))
transcript_lens = torch.zeros(bs, dtype=torch.int32)
for i, sample in enumerate(batch):
audio[i].narrow(0, 0, sample[0].size(0)).copy_(sample[0])
audio_lens[i] = sample[1]
transcript[i].narrow(0, 0, sample[2].size(0)).copy_(sample[2])
transcript_lens[i] = sample[3]
return audio, audio_lens, transcript, transcript_lens
def get_data_loader(dataset, batch_size, multi_gpu=True, shuffle=True,
drop_last=True, num_workers=4):
kw = {'dataset': dataset, 'collate_fn': collate_fn,
'num_workers': num_workers, 'pin_memory': True}
if multi_gpu:
loader_shuffle = False
sampler = DistributedSampler(dataset, shuffle=shuffle)
else:
loader_shuffle = shuffle
sampler = None
return DataLoader(batch_size=batch_size, drop_last=drop_last,
sampler=sampler, shuffle=loader_shuffle, **kw)
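

# Illustrative sketch of wiring the classes above together; the dataset
# directory, manifest name, and label set below are hypothetical.
def _example_build_loader():
    labels = list(" abcdefghijklmnopqrstuvwxyz'")
    dataset = AudioDataset(
        data_dir="/datasets/LibriSpeech",                    # hypothetical
        manifest_fpaths=["librispeech-dev-clean-wav.json"],  # hypothetical
        labels=labels,
        sample_rate=16000,
        max_duration=16.7,
        trim_silence=True,
        speed_perturbation=None)
    # collate_fn pads audio and transcripts to the longest sample in the batch
    return get_data_loader(dataset, batch_size=16, multi_gpu=False,
                           shuffle=True, drop_last=True, num_workers=4)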

View file

@ -0,0 +1,301 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import librosa
import torch
import torch.nn as nn
class BaseFeatures(nn.Module):
"""Base class for GPU accelerated audio preprocessing."""
__constants__ = ["pad_align", "pad_to_max_duration", "max_len"]
def __init__(self, pad_align, pad_to_max_duration, max_duration,
sample_rate, window_size, window_stride, spec_augment=None,
cutout_augment=None):
super(BaseFeatures, self).__init__()
self.pad_align = pad_align
self.pad_to_max_duration = pad_to_max_duration
self.win_length = int(sample_rate * window_size) # frame size
self.hop_length = int(sample_rate * window_stride)
# Calculate maximum sequence length (# frames)
if pad_to_max_duration:
self.max_len = 1 + math.ceil(
(max_duration * sample_rate - self.win_length) / self.hop_length
)
if spec_augment is not None:
self.spec_augment = SpecAugment(**spec_augment)
else:
self.spec_augment = None
if cutout_augment is not None:
self.cutout_augment = CutoutAugment(**cutout_augment)
else:
self.cutout_augment = None
@torch.no_grad()
def calculate_features(self, audio, audio_lens):
return audio, audio_lens
def __call__(self, audio, audio_lens):
dtype = audio.dtype
audio = audio.float()
feat, feat_lens = self.calculate_features(audio, audio_lens)
feat = self.apply_padding(feat)
if self.cutout_augment is not None:
feat = self.cutout_augment(feat)
if self.spec_augment is not None:
feat = self.spec_augment(feat)
feat = feat.to(dtype)
return feat, feat_lens
def apply_padding(self, x):
if self.pad_to_max_duration:
x_size = max(x.size(-1), self.max_len)
else:
x_size = x.size(-1)
if self.pad_align > 0:
pad_amt = x_size % self.pad_align
else:
pad_amt = 0
padded_len = x_size + (self.pad_align - pad_amt if pad_amt > 0 else 0)
return nn.functional.pad(x, (0, padded_len - x.size(-1)))
class SpecAugment(nn.Module):
"""Spec augment. refer to https://arxiv.org/abs/1904.08779
"""
def __init__(self, freq_masks=0, min_freq=0, max_freq=10, time_masks=0,
min_time=0, max_time=10):
super(SpecAugment, self).__init__()
assert 0 <= min_freq <= max_freq
assert 0 <= min_time <= max_time
self.freq_masks = freq_masks
self.min_freq = min_freq
self.max_freq = max_freq
self.time_masks = time_masks
self.min_time = min_time
self.max_time = max_time
@torch.no_grad()
def forward(self, x):
sh = x.shape
mask = torch.zeros(x.shape, dtype=torch.bool, device=x.device)
for idx in range(sh[0]):
for _ in range(self.freq_masks):
w = torch.randint(self.min_freq, self.max_freq + 1, size=(1,)).item()
f0 = torch.randint(0, max(1, sh[1] - w), size=(1,))
mask[idx, f0:f0+w] = 1
for _ in range(self.time_masks):
w = torch.randint(self.min_time, self.max_time + 1, size=(1,)).item()
t0 = torch.randint(0, max(1, sh[2] - w), size=(1,))
mask[idx, :, t0:t0+w] = 1
return x.masked_fill(mask, 0)
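

# Quick illustration (not called anywhere): SpecAugment masks random frequency
# bands and time steps of a (batch, n_filt, n_frames) feature tensor.
# The tensor shape and mask settings below are hypothetical.
def _example_spec_augment():
    features = torch.randn(4, 64, 300)
    augment = SpecAugment(freq_masks=2, max_freq=15, time_masks=2, max_time=20)
    masked = augment(features)
    return masked  # same shape, with the masked regions set to zero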
class CutoutAugment(nn.Module):
"""Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf
"""
def __init__(self, masks=0, min_freq=20, max_freq=20, min_time=5, max_time=5):
super(CutoutAugment, self).__init__()
assert 0 <= min_freq <= max_freq
assert 0 <= min_time <= max_time
self.masks = masks
self.min_freq = min_freq
self.max_freq = max_freq
self.min_time = min_time
self.max_time = max_time
@torch.no_grad()
def forward(self, x):
sh = x.shape
mask = torch.zeros(x.shape, dtype=torch.bool, device=x.device)
for idx in range(sh[0]):
for i in range(self.masks):
w = torch.randint(self.min_freq, self.max_freq + 1, size=(1,)).item()
h = torch.randint(self.min_time, self.max_time + 1, size=(1,)).item()
f0 = int(random.uniform(0, sh[1] - w))
t0 = int(random.uniform(0, sh[2] - h))
mask[idx, f0:f0+w, t0:t0+h] = 1
return x.masked_fill(mask, 0)
@torch.jit.script
def normalize_batch(x, seq_len, normalize_type: str):
# print ("normalize_batch: x, seq_len, shapes: ", x.shape, seq_len, seq_len.shape)
if normalize_type == "per_feature":
x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype,
device=x.device)
x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype,
device=x.device)
for i in range(x.shape[0]):
x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1)
x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1)
# make sure x_std is not zero
x_std += 1e-5
return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2)
elif normalize_type == "all_features":
x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
for i in range(x.shape[0]):
x_mean[i] = x[i, :, :int(seq_len[i])].mean()
x_std[i] = x[i, :, :int(seq_len[i])].std()
# make sure x_std is not zero
x_std += 1e-5
return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1)
else:
return x
@torch.jit.script
def splice_frames(x, frame_splicing: int):
""" Stacks frames together across feature dim
input is batch_size, feature_dim, num_frames
output is batch_size, feature_dim*frame_splicing, num_frames
"""
seq = [x]
    # TORCHSCRIPT: JIT doesn't like range(start, stop)
for n in range(frame_splicing - 1):
seq.append(torch.cat([x[:, :, :n + 1], x[:, :, n + 1:]], dim=2))
return torch.cat(seq, dim=1)
class FilterbankFeatures(BaseFeatures):
# For JIT, https://pytorch.org/docs/stable/jit.html#python-defined-constants
__constants__ = ["dither", "preemph", "n_fft", "hop_length", "win_length",
"log", "frame_splicing", "normalize"]
# torchscript: "center" removed due to a bug
def __init__(self, spec_augment=None, cutout_augment=None,
sample_rate=8000, window_size=0.02, window_stride=0.01,
window="hamming", normalize="per_feature", n_fft=None,
preemph=0.97, n_filt=64, lowfreq=0, highfreq=None, log=True,
dither=1e-5, pad_align=8, pad_to_max_duration=False,
max_duration=float('inf'), frame_splicing=1):
super(FilterbankFeatures, self).__init__(
pad_align=pad_align, pad_to_max_duration=pad_to_max_duration,
max_duration=max_duration, sample_rate=sample_rate,
window_size=window_size, window_stride=window_stride,
spec_augment=spec_augment, cutout_augment=cutout_augment)
torch_windows = {
'hann': torch.hann_window,
'hamming': torch.hamming_window,
'blackman': torch.blackman_window,
'bartlett': torch.bartlett_window,
'none': None,
}
self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
self.normalize = normalize
self.log = log
#TORCHSCRIPT: Check whether or not we need this
self.dither = dither
self.frame_splicing = frame_splicing
self.n_filt = n_filt
self.preemph = preemph
highfreq = highfreq or sample_rate / 2
window_fn = torch_windows.get(window, None)
window_tensor = window_fn(self.win_length,
periodic=False) if window_fn else None
filterbanks = torch.tensor(
librosa.filters.mel(sample_rate, self.n_fft, n_mels=n_filt,
fmin=lowfreq, fmax=highfreq),
dtype=torch.float).unsqueeze(0)
# torchscript
self.register_buffer("fb", filterbanks)
self.register_buffer("window", window_tensor)
def get_seq_len(self, seq_len):
return torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to(
dtype=torch.int)
# do stft
# TORCHSCRIPT: center removed due to bug
def stft(self, x):
return torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
win_length=self.win_length,
window=self.window.to(dtype=torch.float))
@torch.no_grad()
def calculate_features(self, x, seq_len):
dtype = x.dtype
seq_len = self.get_seq_len(seq_len)
# dither
if self.dither > 0:
x += self.dither * torch.randn_like(x)
# do preemphasis
if self.preemph is not None:
x = torch.cat(
(x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
x = self.stft(x)
# get power spectrum
x = x.pow(2).sum(-1)
# dot with filterbank energies
x = torch.matmul(self.fb.to(x.dtype), x)
# log features if required
if self.log:
x = torch.log(x + 1e-20)
# frame splicing if required
if self.frame_splicing > 1:
raise ValueError('Frame splicing not supported')
# normalize if required
x = normalize_batch(x, seq_len, normalize_type=self.normalize)
# mask to zero any values beyond seq_len in batch,
# pad to multiple of `pad_align` (for efficiency)
max_len = x.size(-1)
mask = torch.arange(max_len, dtype=seq_len.dtype, device=x.device)
mask = mask.expand(x.size(0), max_len) >= seq_len.unsqueeze(1)
x = x.masked_fill(mask.unsqueeze(1), 0)
# TORCHSCRIPT: Is this del important? It breaks scripting
# del mask
return x.to(dtype), seq_len
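

# Illustrative end-to-end sketch: turn a batch of raw waveforms into padded
# log-mel features with FilterbankFeatures. Shapes and settings are hypothetical.
def _example_extract_features():
    featurizer = FilterbankFeatures(sample_rate=16000, window_size=0.02,
                                    window_stride=0.01, n_filt=64,
                                    pad_align=16)
    audio = torch.randn(2, 16000 * 3)                  # two 3-second clips
    audio_lens = torch.tensor([16000 * 3, 16000 * 2])  # valid lengths in samples
    feats, feat_lens = featurizer(audio, audio_lens)
    # feats: (2, 64, n_frames padded to a multiple of 16); frames beyond each
    # valid length are zeroed; feat_lens holds the valid lengths in frames
    return feats, feat_lens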

View file

@ -0,0 +1,276 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import re
from collections import OrderedDict
import torch
import torch.distributed as dist
from .metrics import word_error_rate
def print_once(msg):
if not dist.is_initialized() or dist.get_rank() == 0:
print(msg)
def add_ctc_blank(symbols):
return symbols + ['<BLANK>']
def ctc_decoder_predictions_tensor(tensor, labels):
"""
    Takes the output of the greedy CTC decoder and performs the CTC decoding
    algorithm to merge repeated symbols and remove the blank symbol.
    Args:
        tensor: model output tensor
        labels: A list of labels
    Returns:
        list of decoded hypotheses (one string per batch element)
"""
blank_id = len(labels) - 1
hypotheses = []
labels_map = {i: labels[i] for i in range(len(labels))}
prediction_cpu_tensor = tensor.long().cpu()
# iterate over batch
for ind in range(prediction_cpu_tensor.shape[0]):
prediction = prediction_cpu_tensor[ind].numpy().tolist()
# CTC decoding procedure
decoded_prediction = []
previous = len(labels) - 1 # id of a blank symbol
for p in prediction:
if (p != previous or previous == blank_id) and p != blank_id:
decoded_prediction.append(p)
previous = p
hypothesis = ''.join([labels_map[c] for c in decoded_prediction])
hypotheses.append(hypothesis)
return hypotheses
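

# Toy illustration (hypothetical labels) of the CTC collapse rule implemented
# above: repeated symbols are merged and blank frames are dropped.
def _example_ctc_decode():
    labels = add_ctc_blank(['a', 'b', 'c'])          # blank id == 3
    preds = torch.tensor([[0, 0, 3, 1, 1, 3, 2]])    # frames: a a _ b b _ c
    return ctc_decoder_predictions_tensor(preds, labels)  # -> ['abc']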
def greedy_wer(preds, tgt, tgt_lens, labels):
"""
    Decodes greedy CTC predictions and computes the word error rate against
    the reference transcripts.
    Args:
        preds: predictions from the greedy CTC decoder
        tgt: target transcript tensor
        tgt_lens: lengths of the target transcripts
        labels: A list of labels
    Returns:
        word error rate, an example hypothesis, and an example reference
"""
with torch.no_grad():
references = gather_transcripts([tgt], [tgt_lens], labels)
hypotheses = ctc_decoder_predictions_tensor(preds, labels)
wer, _, _ = word_error_rate(hypotheses, references)
return wer, hypotheses[0], references[0]
def gather_losses(losses_list):
return [torch.mean(torch.stack(losses_list))]
def gather_predictions(predictions_list, labels):
results = []
for prediction in predictions_list:
results += ctc_decoder_predictions_tensor(prediction, labels=labels)
return results
def gather_transcripts(transcript_list, transcript_len_list, labels):
results = []
labels_map = {i: labels[i] for i in range(len(labels))}
# iterate over workers
for txt, lens in zip(transcript_list, transcript_len_list):
for t, l in zip(txt.long().cpu(), lens.long().cpu()):
t = list(t.numpy())
results.append(''.join([labels_map[c] for c in t[:l]]))
return results
def process_evaluation_batch(tensors, global_vars, labels):
"""
Processes results of an iteration and saves it in global_vars
Args:
tensors: dictionary with results of an evaluation iteration, e.g. loss, predictions, transcript, and output
        global_vars: dictionary where processed results of the iteration are saved
labels: A list of labels
"""
for kv, v in tensors.items():
if kv.startswith('loss'):
global_vars['EvalLoss'] += gather_losses(v)
elif kv.startswith('predictions'):
global_vars['preds'] += gather_predictions(v, labels)
elif kv.startswith('transcript_length'):
transcript_len_list = v
elif kv.startswith('transcript'):
transcript_list = v
elif kv.startswith('output'):
global_vars['logits'] += v
global_vars['txts'] += gather_transcripts(
transcript_list, transcript_len_list, labels)
def process_evaluation_epoch(aggregates, tag=None):
"""
    Processes results from each worker at the end of evaluation and combines them into the final result
Args:
aggregates: dictionary containing information of entire evaluation
Return:
wer: final word error rate
loss: final loss
"""
if 'losses' in aggregates:
eloss = torch.mean(torch.stack(aggregates['losses'])).item()
else:
eloss = None
hypotheses = aggregates['preds']
references = aggregates['txts']
wer, scores, num_words = word_error_rate(hypotheses, references)
multi_gpu = dist.is_initialized()
if multi_gpu:
if eloss is not None:
eloss /= dist.get_world_size()
eloss_tensor = torch.tensor(eloss).cuda()
dist.all_reduce(eloss_tensor)
eloss = eloss_tensor.item()
scores_tensor = torch.tensor(scores).cuda()
dist.all_reduce(scores_tensor)
scores = scores_tensor.item()
num_words_tensor = torch.tensor(num_words).cuda()
dist.all_reduce(num_words_tensor)
num_words = num_words_tensor.item()
wer = scores * 1.0 / num_words
return wer, eloss
def num_weights(module):
return sum(p.numel() for p in module.parameters() if p.requires_grad)
class Checkpointer(object):
def __init__(self, save_dir, model_name, keep_milestones=[100, 200, 300]):
self.save_dir = save_dir
self.keep_milestones = keep_milestones
self.model_name = model_name
tracked = [
            (int(re.search(r'epoch(\d+)_', f).group(1)), f)
for f in glob.glob(f'{save_dir}/{self.model_name}_epoch*_checkpoint.pt')]
tracked = sorted(tracked, key=lambda t: t[0])
self.tracked = OrderedDict(tracked)
def save(self, model, ema_model, optimizer, scaler, epoch, step, best_wer,
is_best=False):
"""Saves model checkpoint for inference/resuming training.
Args:
model: the model, optionally wrapped by DistributedDataParallel
ema_model: model with averaged weights, can be None
optimizer: optimizer
epoch (int): epoch during which the model is saved
step (int): number of steps since beginning of training
best_wer (float): lowest recorded WER on the dev set
is_best (bool, optional): set name of checkpoint to 'best'
and overwrite the previous one
"""
rank = 0
if dist.is_initialized():
dist.barrier()
rank = dist.get_rank()
if rank != 0:
return
# Checkpoint already saved
if not is_best and epoch in self.tracked:
return
unwrap_ddp = lambda model: getattr(model, 'module', model)
state = {
'epoch': epoch,
'step': step,
'best_wer': best_wer,
'state_dict': unwrap_ddp(model).state_dict(),
'ema_state_dict': unwrap_ddp(ema_model).state_dict() if ema_model is not None else None,
'optimizer': optimizer.state_dict(),
'scaler': scaler.state_dict(),
}
if is_best:
fpath = os.path.join(
self.save_dir, f"{self.model_name}_best_checkpoint.pt")
else:
fpath = os.path.join(
self.save_dir, f"{self.model_name}_epoch{epoch}_checkpoint.pt")
print_once(f"Saving {fpath}...")
torch.save(state, fpath)
if not is_best:
# Remove old checkpoints; keep milestones and the last two
self.tracked[epoch] = fpath
for epoch in set(list(self.tracked)[:-2]) - set(self.keep_milestones):
try:
os.remove(self.tracked[epoch])
except:
pass
del self.tracked[epoch]
    def last_checkpoint(self):
        tracked = list(self.tracked.values())
        if len(tracked) >= 1:
            try:
                torch.load(tracked[-1], map_location='cpu')
                return tracked[-1]
            except:
                print_once(f'Last checkpoint {tracked[-1]} appears corrupted.')
                if len(tracked) >= 2:
                    return tracked[-2]
        return None
def load(self, fpath, model, ema_model, optimizer, scaler, meta):
print_once(f'Loading model from {fpath}')
checkpoint = torch.load(fpath, map_location="cpu")
unwrap_ddp = lambda model: getattr(model, 'module', model)
state_dict = checkpoint['state_dict']
unwrap_ddp(model).load_state_dict(state_dict, strict=True)
if ema_model is not None:
if checkpoint.get('ema_state_dict') is not None:
key = 'ema_state_dict'
else:
key = 'state_dict'
print_once('WARNING: EMA weights not found in the checkpoint.')
print_once('WARNING: Initializing EMA model with regular params.')
state_dict = checkpoint[key]
unwrap_ddp(ema_model).load_state_dict(state_dict, strict=True)
optimizer.load_state_dict(checkpoint['optimizer'])
scaler.load_state_dict(checkpoint['scaler'])
meta['start_epoch'] = checkpoint.get('epoch')
meta['best_wer'] = checkpoint.get('best_wer', meta['best_wer'])
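

# Minimal illustration (hypothetical toy model, optimizer, and directory) of the
# Checkpointer save/resume round trip defined above.
def _example_checkpointing(save_dir="/tmp/ckpt"):
    os.makedirs(save_dir, exist_ok=True)
    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler(enabled=False)
    ckpt = Checkpointer(save_dir, model_name="QuartzNet")
    ckpt.save(model, None, optimizer, scaler, epoch=1, step=100, best_wer=1.0)
    meta = {'best_wer': 1.0, 'start_epoch': 0}
    last = ckpt.last_checkpoint()
    if last is not None:
        ckpt.load(last, model, None, optimizer, scaler, meta)
    return meta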

View file

@ -0,0 +1,59 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def __levenshtein(a, b):
"""Calculates the Levenshtein distance between two sequences."""
n, m = len(a), len(b)
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
current = list(range(n + 1))
for i in range(1, m + 1):
previous, current = current, [i] + [0] * n
for j in range(1, n + 1):
add, delete = previous[j] + 1, current[j - 1] + 1
change = previous[j - 1]
if a[j - 1] != b[i - 1]:
change = change + 1
current[j] = min(add, delete, change)
return current[n]
def word_error_rate(hypotheses, references):
"""Computes average Word Error Rate (WER) between two text lists."""
scores = 0
words = 0
len_diff = len(references) - len(hypotheses)
if len_diff > 0:
raise ValueError("Uneqal number of hypthoses and references: "
"{0} and {1}".format(len(hypotheses), len(references)))
elif len_diff < 0:
hypotheses = hypotheses[:len_diff]
for h, r in zip(hypotheses, references):
h_list = h.split()
r_list = r.split()
words += len(r_list)
scores += __levenshtein(h_list, r_list)
    if words != 0:
        wer = 1.0 * scores / words
    else:
        wer = float('inf')
return wer, scores, words
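

# Worked example (illustration only, hypothetical strings) for the WER
# computation above.
def _example_wer():
    hypotheses = ["the cat sat", "hello there"]
    references = ["the cat sat on the mat", "hello there"]
    # word-level edit distances: 3 (three missing words) and 0,
    # over 6 + 2 = 8 reference words -> WER = 3 / 8 = 0.375
    wer, scores, words = word_error_rate(hypotheses, references)
    return wer, scores, words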

View file

@ -0,0 +1,269 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.optim import Optimizer
import math
def lr_policy(step, epoch, initial_lr, optimizer, steps_per_epoch, warmup_epochs,
hold_epochs, num_epochs=None, policy='linear', min_lr=1e-5,
exp_gamma=None):
"""
    Learning rate schedule with warmup, hold, and decay phases.
    Args:
        step: current iteration number
        epoch: current epoch number
        initial_lr: base learning rate
        optimizer: optimizer whose param groups get the new learning rate
        steps_per_epoch: number of iterations per epoch
        warmup_epochs: number of epochs with linearly increasing learning rate
        hold_epochs: number of epochs with constant learning rate after warmup
        num_epochs: total number of epochs (required by the 'legacy' policy)
        policy: 'legacy' or 'exponential'
        min_lr: lower bound on the learning rate
        exp_gamma: per-epoch decay factor for the 'exponential' policy
"""
warmup_steps = warmup_epochs * steps_per_epoch
hold_steps = hold_epochs * steps_per_epoch
if policy == 'legacy':
assert num_epochs is not None
tot_steps = num_epochs * steps_per_epoch
if step < warmup_steps:
a = (step + 1) / (warmup_steps + 1)
elif step < warmup_steps + hold_steps:
a = 1.0
else:
a = (((tot_steps - step)
/ (tot_steps - warmup_steps - hold_steps)) ** 2)
elif policy == 'exponential':
assert exp_gamma is not None
if step < warmup_steps:
a = (step + 1) / (warmup_steps + 1)
elif step < warmup_steps + hold_steps:
a = 1.0
else:
a = exp_gamma ** (epoch - warmup_epochs - hold_epochs)
else:
raise ValueError
new_lr = max(a * initial_lr, min_lr)
for param_group in optimizer.param_groups:
param_group['lr'] = new_lr
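

# Illustration only: how lr_policy drives the 'exponential' schedule. The tiny
# model and all numbers below are hypothetical.
def _example_lr_schedule():
    model = torch.nn.Linear(4, 4)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    steps_per_epoch, warmup_epochs, hold_epochs = 100, 2, 1
    lrs = []
    for epoch in range(6):
        for step_in_epoch in range(steps_per_epoch):
            step = epoch * steps_per_epoch + step_in_epoch
            lr_policy(step, epoch, initial_lr=0.01, optimizer=opt,
                      steps_per_epoch=steps_per_epoch,
                      warmup_epochs=warmup_epochs, hold_epochs=hold_epochs,
                      policy='exponential', exp_gamma=0.981)
        lrs.append(opt.param_groups[0]['lr'])
    # warmup for 2 epochs, hold for 1, then decay by exp_gamma every epoch
    return lrs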
class AdamW(Optimizer):
"""Implements AdamW algorithm.
    It extends `Adam: A Method for Stochastic Optimization`_ with decoupled
    weight decay, as proposed in `Decoupled Weight Decay Regularization`
    (https://arxiv.org/abs/1711.05101).
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
Adam: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=0, amsgrad=False):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay, amsgrad=amsgrad)
super(AdamW, self).__init__(params, defaults)
def __setstate__(self, state):
super(AdamW, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
amsgrad = group['amsgrad']
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if amsgrad:
# Maintains max of all exp. moving avg. of sq. grad. values
state['max_exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
if amsgrad:
max_exp_avg_sq = state['max_exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = max_exp_avg_sq.sqrt().add_(group['eps'])
else:
denom = exp_avg_sq.sqrt().add_(group['eps'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1), alpha=-step_size)
return loss
class Novograd(Optimizer):
"""
Implements Novograd algorithm.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.95, 0))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
grad_averaging: gradient averaging
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False)
"""
def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8,
weight_decay=0, grad_averaging=False, amsgrad=False):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay,
grad_averaging=grad_averaging,
amsgrad=amsgrad)
super(Novograd, self).__init__(params, defaults)
def __setstate__(self, state):
super(Novograd, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Sparse gradients are not supported.')
amsgrad = group['amsgrad']
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)
if amsgrad:
# Maintains max of all exp. moving avg. of sq. grad. values
state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
if amsgrad:
max_exp_avg_sq = state['max_exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
norm = torch.sum(torch.pow(grad, 2))
if exp_avg_sq == 0:
exp_avg_sq.copy_(norm)
else:
exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = max_exp_avg_sq.sqrt().add_(group['eps'])
else:
denom = exp_avg_sq.sqrt().add_(group['eps'])
grad.div_(denom)
if group['weight_decay'] != 0:
grad.add_(p.data, alpha=group['weight_decay'])
if group['grad_averaging']:
grad.mul_(1 - beta1)
exp_avg.mul_(beta1).add_(grad)
p.data.add_(exp_avg, alpha=-group['lr'])
return loss
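A minimal usage sketch of the optimizer above; the toy linear model and random data are hypothetical stand-ins, and the Novograd class is assumed to be importable from this module.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(64, 29)                       # stand-in for the acoustic model
opt = Novograd(model.parameters(), lr=0.01, betas=(0.95, 0.5), weight_decay=1e-3)
x, y = torch.randn(8, 64), torch.randn(8, 29)
opt.zero_grad()
F.mse_loss(model(x), y).backward()
opt.step()                                      # applies the update defined above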

View file

@@ -0,0 +1,128 @@
import torch
import numpy as np
from torch.utils.data.sampler import Sampler
class DistributedSampler(Sampler):
def __init__(self, dataset, batch_size, world_size, rank):
"""
Constructor for the DistributedSampler.
:param dataset: dataset
:param batch_size: local batch size
:param world_size: number of distributed workers
:param rank: rank of the current process
"""
self.dataset = dataset
self.world_size = world_size
self.rank = rank
self.epoch = 0
self.batch_size = batch_size
self.global_batch_size = batch_size * world_size
self.data_len = len(self.dataset)
self.num_samples = self.data_len // self.global_batch_size \
* self.global_batch_size
def distribute_batches(self, indices):
"""
Assigns batches to workers.
Consecutive ranks get consecutive batches.
:param indices: torch.tensor with batch indices
"""
assert len(indices) == self.num_samples
indices = indices.view(-1, self.batch_size)
indices = indices[self.rank::self.world_size].contiguous()
indices = indices.view(-1)
indices = indices.tolist()
assert len(indices) == self.num_samples // self.world_size
return indices
def reshuffle_batches(self, indices, rng):
"""
Permutes global batches
:param indices: torch.tensor with batch indices
:param rng: instance of torch.Generator
"""
indices = indices.view(-1, self.global_batch_size)
num_batches = indices.shape[0]
order = torch.randperm(num_batches, generator=rng)
indices = indices[order, :]
indices = indices.view(-1)
return indices
def __iter__(self):
g = torch.Generator()
g.manual_seed(self.epoch)
# generate permutation
indices = torch.randperm(self.data_len, generator=g)
# make indices evenly divisible by (batch_size * world_size)
indices = indices[:self.num_samples]
# assign batches to workers
indices = self.distribute_batches(indices)
return iter(indices)
def set_epoch(self, epoch):
"""
Sets current epoch index.
Epoch index is used to seed RNG in __iter__() function.
:param epoch: index of current epoch
"""
self.epoch = epoch
def __len__(self):
return self.num_samples // self.world_size
class BucketingSampler(DistributedSampler):
def __init__(self, dataset, batch_size, num_buckets, world_size, rank):
"""
Bucketing sampler with approx. equally-sized buckets.
:param dataset: dataset
:param batch_size: local batch size
:param num_buckets: number of buckets
:param world_size: number of distributed workers
:param rank: rank of the current process
"""
super().__init__(dataset, batch_size, world_size, rank)
self.num_buckets = num_buckets
len_ids = np.argsort([sample['duration'] for sample in dataset.samples])
self.buckets = [torch.from_numpy(t)
for t in np.array_split(len_ids, num_buckets)]
global_bs = self.global_batch_size
def __iter__(self):
g = torch.Generator()
g.manual_seed(self.epoch)
global_bsz = self.global_batch_size
indices = []
for bid in range(self.num_buckets):
# random shuffle within current bucket
perm = torch.randperm(len(self.buckets[bid]), generator=g)
bucket_indices = self.buckets[bid][perm]
# add samples from current bucket to indices for current epoch
indices.append(bucket_indices)
indices = torch.cat(indices)
# make indices evenly divisible by global batch size
length = len(indices) // global_bsz * global_bsz
indices = indices[:length]
assert len(indices) % self.global_batch_size == 0
# perform global reshuffle of all global batches
indices = self.reshuffle_batches(indices, g)
# distribute batches to individual workers
indices = self.distribute_batches(indices)
return iter(indices)
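A rough single-process sketch (world_size=1, rank=0) of driving the sampler with a plain DataLoader; the toy dataset below is hypothetical and only mimics the samples-with-duration structure the constructor expects.
import torch
from torch.utils.data import DataLoader, Dataset

class ToyAudioDataset(Dataset):   # hypothetical stand-in for AudioDataset
    def __init__(self, n=256):
        self.samples = [{'duration': float(i % 17) + 1.0} for i in range(n)]
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        return torch.tensor(self.samples[idx]['duration'])

dataset = ToyAudioDataset()
sampler = BucketingSampler(dataset, batch_size=16, num_buckets=4,
                           world_size=1, rank=0)
loader = DataLoader(dataset, batch_size=16, sampler=sampler, drop_last=True)
for epoch in range(2):
    sampler.set_epoch(epoch)      # reseeds the per-epoch shuffle in __iter__
    for batch in loader:
        pass                      # training step goes here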

View file

@@ -0,0 +1,173 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import atexit
import glob
import os
import re
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import dllogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
tb_loggers = {}
class TBLogger:
"""
dummies: pad TensorBoard with empty dummy plots so that the legend
always fits alongside the real plots
"""
def __init__(self, enabled, log_dir, name, interval=1, dummies=True):
self.enabled = enabled
self.interval = interval
self.cache = {}
if self.enabled:
self.summary_writer = SummaryWriter(
log_dir=os.path.join(log_dir, name),
flush_secs=120, max_queue=200)
atexit.register(self.summary_writer.close)
if dummies:
for key in ('aaa', 'zzz'):
self.summary_writer.add_scalar(key, 0.0, 1)
def log(self, step, data):
for k, v in data.items():
self.log_value(step, k, v.item() if type(v) is torch.Tensor else v)
def log_value(self, step, key, val, stat='mean'):
if self.enabled:
if key not in self.cache:
self.cache[key] = []
self.cache[key].append(val)
if len(self.cache[key]) == self.interval:
agg_val = getattr(np, stat)(self.cache[key])
self.summary_writer.add_scalar(key, agg_val, step)
del self.cache[key]
def log_grads(self, step, model):
if self.enabled:
norms = [p.grad.norm().item() for p in model.parameters()
if p.grad is not None]
for stat in ('max', 'min', 'mean'):
self.log_value(step, f'grad_{stat}', getattr(np, stat)(norms),
stat=stat)
def unique_log_fpath(log_fpath):
if not os.path.isfile(log_fpath):
return log_fpath
# Avoid overwriting old logs
saved = sorted([int(re.search(r'\.(\d+)', f).group(1))
for f in glob.glob(f'{log_fpath}.*')])
log_num = (saved[-1] if saved else 0) + 1
return f'{log_fpath}.{log_num}'
def stdout_step_format(step):
if isinstance(step, str):
return step
fields = []
if len(step) > 0:
fields.append("epoch {:>4}".format(step[0]))
if len(step) > 1:
fields.append("iter {:>4}".format(step[1]))
if len(step) > 2:
fields[-1] += "/{}".format(step[2])
return " | ".join(fields)
def stdout_metric_format(metric, metadata, value):
name = metadata.get("name", metric + " : ")
unit = metadata.get("unit", None)
format = f'{{{metadata.get("format", "")}}}'
fields = [name, format.format(value) if value is not None else value, unit]
fields = [f for f in fields if f is not None]
return "| " + " ".join(fields)
def init_log(args):
enabled = (args.local_rank == 0)
if enabled:
fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
backends = [JSONStreamBackend(Verbosity.DEFAULT,
unique_log_fpath(fpath)),
StdOutBackend(Verbosity.VERBOSE,
step_format=stdout_step_format,
metric_format=stdout_metric_format)]
else:
backends = []
dllogger.init(backends=backends)
dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})
for id_, pref in [('train', ''), ('train_avg', 'avg train '),
('dev', ' avg dev '), ('dev_ema', ' EMA dev ')]:
dllogger.metadata(f"{id_}_loss",
{"name": f"{pref}loss", "format": ":>7.2f"})
dllogger.metadata(f"{id_}_wer",
{"name": f"{pref}wer", "format": ":>6.2f"})
dllogger.metadata(f"{id_}_throughput",
{"name": f"{pref}utts/s", "format": ":>5.0f"})
dllogger.metadata(f"{id_}_took",
{"name": "took", "unit": "s", "format": ":>5.2f"})
tb_subsets = ['train', 'dev', 'dev_ema'] if args.ema else ['train', 'dev']
global tb_loggers
tb_loggers = {s: TBLogger(enabled, args.output_dir, name=s)
for s in tb_subsets}
log_parameters(vars(args), tb_subset='train')
def log(step, tb_total_steps=None, subset='train', data={}):
if tb_total_steps is not None:
tb_loggers[subset].log(tb_total_steps, data)
if subset != '':
data = {f'{subset}_{key}': val for key, val in data.items()}
dllogger.log(step, data=data)
def log_grads_tb(tb_total_steps, grads, tb_subset='train'):
tb_loggers[tb_subset].log_grads(tb_total_steps, grads)
def log_parameters(data, verbosity=0, tb_subset=None):
for k, v in data.items():
dllogger.log(step="PARAMETER", data={k: v}, verbosity=verbosity)
if tb_subset is not None and tb_loggers[tb_subset].enabled:
tb_data = {k: v for k, v in data.items()
if type(v) in (str, bool, int, float)}
tb_loggers[tb_subset].summary_writer.add_hparams(tb_data, {})
def flush_log():
dllogger.flush()
for tbl in tb_loggers.values():
if tbl.enabled:
tbl.summary_writer.flush()
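A rough usage sketch of the helpers above; the namespace fields are the minimal set that init_log() reads and are assumed here, not taken from the training script.
from types import SimpleNamespace

args = SimpleNamespace(local_rank=0, log_file=None, output_dir='.', ema=False)
init_log(args)
for it in range(1, 4):
    log(step=(1, it, 3), tb_total_steps=it, subset='train',
        data={'loss': 1.23, 'lrate': 1e-3})
flush_log()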

View file

@@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@@ -0,0 +1,32 @@
# Copyright (c) 2017 Keith Ito
""" from https://github.com/keithito/tacotron """
import re
import string
from . import cleaners
def _clean_text(text, cleaner_names, *args):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text, *args)
return text
def punctuation_map(labels):
# Punctuation to remove
punctuation = string.punctuation
punctuation = punctuation.replace("+", "")
punctuation = punctuation.replace("&", "")
# TODO We might also want to consider:
# @ -> at
# # -> number, pound, hashtag
# ~ -> tilde
# _ -> underscore
# % -> percent
# If a punctuation symbol is inside our vocab, we do not remove it from the text
for l in labels:
punctuation = punctuation.replace(l, "")
# Turn all punctuation to whitespace
table = str.maketrans(punctuation, " " * len(punctuation))
return table
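Illustrative example of the resulting table, assuming a lowercase character vocabulary: in-vocabulary symbols (here the apostrophe) plus '+' and '&' survive, everything else maps to a space.
labels = [" ", "'"] + list("abcdefghijklmnopqrstuvwxyz")
table = punctuation_map(labels)
print("it's 5+5, ok & fine!".translate(table))   # -> "it's 5+5  ok & fine "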

View file

@@ -0,0 +1,107 @@
# Copyright (c) 2017 Keith Ito
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" from https://github.com/keithito/tacotron
Modified to add punctuation removal
"""
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)
def convert_to_ascii(text):
return unidecode(text)
def remove_punctuation(text, table):
text = text.translate(table)
text = re.sub(r'&', " and ", text)
text = re.sub(r'\+', " plus ", text)
return text
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text, table=None):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
if table is not None:
text = remove_punctuation(text, table)
text = collapse_whitespace(text)
return text
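A rough end-to-end example of the English pipeline above (no punctuation table passed); the expected output is approximate.
print(english_cleaners("Dr. Müller bought 2 books for $3.50."))
# roughly: "doctor muller bought two books for three dollars, fifty cents."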

View file

@@ -0,0 +1,99 @@
# Copyright (c) 2017 Keith Ito
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" from https://github.com/keithito/tacotron
Modified to add support for time and slight tweaks to _expand_number
"""
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})')
def _remove_commas(m):
return m.group(1).replace(',', '')
def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')
def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
if int(m.group(0)[0]) == 0:
return _inflect.number_to_words(m.group(0), andword='', group=1)
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
# Add check for number phones and other large numbers
elif num > 1000000000 and num % 10000 != 0:
return _inflect.number_to_words(num, andword='', group=1)
else:
return _inflect.number_to_words(num, andword='')
def _expand_time(m):
mins = int(m.group(2))
if mins == 0:
return _inflect.number_to_words(m.group(1))
return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))])
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
text = re.sub(_time_re, _expand_time, text)
return text
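Illustrative only: commas are stripped, while ordinals and plain numbers are spelled out.
print(normalize_numbers("the 3rd of 1,000 entries"))
# roughly: "the third of one thousand entries"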

View file

@@ -0,0 +1,19 @@
# Copyright (c) 2017 Keith Ito
""" from https://github.com/keithito/tacotron """
'''
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text
that has been run through Unidecode. For other data, you can modify _characters.
See TRAINING_DATA.md for details.
'''
from . import cmudict
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]
# Export all symbols:
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet

View file

@@ -0,0 +1,20 @@
import numpy as np
class BenchmarkStats:
""" Tracks statistics used for benchmarking. """
def __init__(self):
self.utts = []
self.times = []
self.losses = []
def update(self, utts, times, losses):
self.utts.append(utts)
self.times.append(times)
self.losses.append(losses)
def get(self, n_epochs):
throughput = sum(self.utts[-n_epochs:]) / sum(self.times[-n_epochs:])
return {'throughput': throughput, 'benchmark_epochs_num': n_epochs,
'loss': np.mean(self.losses[-n_epochs:])}
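A quick sketch with made-up numbers: feed per-epoch utterance counts, wall-clock times and losses, then report throughput over the last two epochs.
stats = BenchmarkStats()
stats.update(utts=1200, times=60.0, losses=0.85)
stats.update(utts=1180, times=58.5, losses=0.80)
print(stats.get(n_epochs=2))
# roughly: {'throughput': 20.1, 'benchmark_epochs_num': 2, 'loss': 0.825}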

View file

@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "QuartzNet"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
input_val:
audio_dataset: &val_dataset
sample_rate: &sample_rate 16000
trim_silence: true
normalize_transcripts: true
filterbank_features: &val_features
normalize: per_feature
sample_rate: *sample_rate
window_size: 0.02
window_stride: 0.01
window: hann
n_filt: &n_filt 64
n_fft: 512
frame_splicing: &frame_splicing 1
dither: 0.00001
pad_align: 16
# For training we keep samples < 16.7s and apply augmentation
input_train:
audio_dataset:
<<: *val_dataset
max_duration: 16.7
ignore_offline_speed_perturbation: true
speed_perturbation:
min_rate: 0.85
max_rate: 1.15
filterbank_features:
<<: *val_features
max_duration: 16.7
spec_augment:
freq_masks: 2
max_freq: 15
time_masks: 2
max_time: 55
quartznet:
encoder:
init: xavier_uniform
in_feats: *n_filt
frame_splicing: *frame_splicing
activation: relu
use_conv_masks: true
blocks:
- &Conv1
filters: 256
repeat: 1
kernel_size: [33]
dilation: [1]
stride: [2]
dropout: 0.0
residual: false
separable: true
- &B1
filters: 256
repeat: 5
kernel_size: [33]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B1
- *B1
- &B2
filters: 256
repeat: 5
kernel_size: [39]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B2
- *B2
- &B3
filters: 512
repeat: 5
kernel_size: [51]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B3
- *B3
- &B4
filters: 512
repeat: 5
kernel_size: [63]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B4
- *B4
- &B5
filters: 512
repeat: 5
kernel_size: [75]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B5
- *B5
- &Conv2
filters: 512
repeat: 1
kernel_size: [87]
dilation: [2]
stride: [1]
dropout: 0.0
residual: false
separable: true
- &Conv3
filters: &enc_feats 1024
repeat: 1
kernel_size: [1]
dilation: [1]
stride: [1]
dropout: 0.0
residual: false
separable: false
decoder:
in_feats: *enc_feats
init: xavier_uniform

View file

@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "QuartzNet"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
input_val:
audio_dataset: &val_dataset
sample_rate: &sample_rate 16000
trim_silence: true
normalize_transcripts: true
filterbank_features: &val_features
normalize: per_feature
sample_rate: *sample_rate
window_size: 0.02
window_stride: 0.01
window: hann
n_filt: &n_filt 64
n_fft: 512
frame_splicing: &frame_splicing 1
dither: 0.00001
pad_align: 16
# For training we keep samples < 16.7s and apply augmentation
input_train:
audio_dataset:
<<: *val_dataset
max_duration: 16.7
ignore_offline_speed_perturbation: true
speed_perturbation:
min_rate: 0.85
max_rate: 1.15
filterbank_features:
<<: *val_features
max_duration: 16.7
spec_augment:
freq_masks: 2
max_freq: 20
time_masks: 2
max_time: 75
quartznet:
encoder:
init: xavier_uniform
in_feats: *n_filt
frame_splicing: *frame_splicing
activation: relu
use_conv_masks: true
blocks:
- &Conv1
filters: 256
repeat: 1
kernel_size: [33]
dilation: [1]
stride: [2]
dropout: 0.2
residual: false
separable: true
- &B1
filters: 256
repeat: 5
kernel_size: [33]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B1
- *B1
- &B2
filters: 256
repeat: 5
kernel_size: [39]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B2
- *B2
- &B3
filters: 512
repeat: 5
kernel_size: [51]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B3
- *B3
- &B4
filters: 512
repeat: 5
kernel_size: [63]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B4
- *B4
- &B5
filters: 512
repeat: 5
kernel_size: [75]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B5
- *B5
- &Conv2
filters: 512
repeat: 1
kernel_size: [87]
dilation: [2]
stride: [1]
dropout: 0.2
residual: false
separable: true
- &Conv3
filters: &enc_feats 1024
repeat: 1
kernel_size: [1]
dilation: [1]
stride: [1]
dropout: 0.2
residual: false
separable: false
decoder:
in_feats: *enc_feats
init: xavier_uniform

Binary image file added (117 KiB); not shown.

Binary image file added (37 KiB); not shown.

View file

@@ -0,0 +1,390 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import math
import os
import random
import time
from heapq import nlargest
from itertools import chain, repeat
from pathlib import Path
from tqdm import tqdm
import dllogger
import torch
import numpy as np
import torch.distributed as distrib
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity
from quartznet import config
from common import helpers
from common.dali.data_loader import DaliDataLoader
from common.dataset import (AudioDataset, FilelistDataset, get_data_loader,
SingleAudioDataset)
from common.features import BaseFeatures, FilterbankFeatures
from common.helpers import print_once, process_evaluation_epoch
from quartznet.model import GreedyCTCDecoder, QuartzNet
from common.tb_dllogger import stdout_metric_format, unique_log_fpath
def get_parser():
parser = argparse.ArgumentParser(description='QuartzNet inference')
parser.add_argument('--batch_size', default=16, type=int,
help='Data batch size')
parser.add_argument('--steps', default=0, type=int,
help='Eval this many steps for every worker')
parser.add_argument('--warmup_steps', default=0, type=int,
help='Burn-in period before measuring latencies')
parser.add_argument('--model_config', type=str, required=True,
help='Relative model config path given dataset folder')
parser.add_argument('--dataset_dir', type=str,
help='Absolute path to dataset folder')
parser.add_argument('--val_manifests', type=str, nargs='+',
help='Relative path to evaluation dataset manifest files')
parser.add_argument('--ckpt', default=None, type=str,
help='Path to model checkpoint')
parser.add_argument('--amp', '--fp16', action='store_true',
help='Use FP16 precision')
parser.add_argument('--cudnn_benchmark', action='store_true',
help='Enable cudnn benchmark')
parser.add_argument('--cpu', action='store_true',
help='Run inference on CPU')
parser.add_argument("--seed", default=None, type=int, help='Random seed')
parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0),
type=int, help='GPU id used for distributed training')
io = parser.add_argument_group('feature and checkpointing setup')
io.add_argument('--dali_device', type=str, choices=['none', 'cpu', 'gpu'],
default='gpu', help='Use DALI pipeline for fast data processing')
io.add_argument('--save_predictions', type=str, default=None,
help='Save predictions in text form at this location')
io.add_argument('--save_logits', default=None, type=str,
help='Save output logits under specified path')
io.add_argument('--transcribe_wav', type=str,
help='Path to a single .wav file (16KHz)')
io.add_argument('--transcribe_filelist', type=str,
help='Path to a filelist with one .wav path per line')
io.add_argument('-o', '--output_dir', default='results/',
help='Output folder to save audio (file per phrase)')
io.add_argument('--log_file', type=str, default=None,
help='Path to a DLLogger log file')
io.add_argument('--ema', action='store_true',
help='Load averaged model weights')
io.add_argument('--torchscript', action='store_true',
help='Evaluate with a TorchScripted model')
io.add_argument('--torchscript_export', action='store_true',
help='Export the model with torch.jit to the output_dir')
io.add_argument('--override_config', type=str, action='append',
help='Overrides arbitrary config value.'
' Syntax: `--override_config nested.config.key=val`.')
return parser
def durs_to_percentiles(durations, ratios):
durations = np.asarray(durations) * 1000 # in ms
latency = durations
latency = latency[5:]
mean_latency = np.mean(latency)
latency_worst = nlargest(math.ceil((1 - min(ratios)) * len(latency)), latency)
latency_ranges = get_percentile(ratios, latency_worst, len(latency))
latency_ranges[0.5] = mean_latency
return latency_ranges
def get_percentile(ratios, arr, nsamples):
res = {}
for a in ratios:
idx = max(int(nsamples * (1 - a)), 0)
res[a] = arr[idx]
return res
def torchscript_export(data_loader, audio_processor, model, greedy_decoder,
output_dir, use_amp, use_conv_masks, model_config, device,
save):
audio_processor.to(device)
for batch in data_loader:
batch = [t.to(device, non_blocking=True) for t in batch]
audio, audio_len, _, _ = batch
feats, feat_lens = audio_processor(audio, audio_len)
break
print("\nExporting featurizer...")
print("\nNOTE: Dithering causes warnings about non-determinism.\n")
ts_feat = torch.jit.trace(audio_processor, (audio, audio_len))
print("\nExporting acoustic model...")
model(feats, feat_lens)
ts_acoustic = torch.jit.trace(model, (feats, feat_lens))
print("\nExporting decoder...")
log_probs = model(feats, feat_lens)
ts_decoder = torch.jit.script(greedy_decoder, log_probs)
print("\nJIT export complete.")
if save:
precision = "fp16" if use_amp else "fp32"
module_name = f'{os.path.basename(model_config)}_{precision}'
ts_feat.save(os.path.join(output_dir, module_name + "_feat.pt"))
ts_acoustic.save(os.path.join(output_dir, module_name + "_acoustic.pt"))
ts_decoder.save(os.path.join(output_dir, module_name + "_decoder.pt"))
return ts_feat, ts_acoustic, ts_decoder
def main():
parser = get_parser()
args = parser.parse_args()
log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
log_fpath = unique_log_fpath(log_fpath)
dllogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
StdOutBackend(Verbosity.VERBOSE,
metric_format=stdout_metric_format)])
[dllogger.log("PARAMETER", {k: v}) for k, v in vars(args).items()]
for step in ['DNN', 'data+DNN', 'data']:
for c in [0.99, 0.95, 0.9, 0.5]:
cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
dllogger.metadata(f'{step.lower()}_latency_{c}',
{'name': f'{step} latency {cs}',
'format': ':>7.2f', 'unit': 'ms'})
dllogger.metadata(
'eval_wer', {'name': 'WER', 'format': ':>3.2f', 'unit': '%'})
if args.cpu:
device = torch.device('cpu')
else:
assert torch.cuda.is_available()
device = torch.device('cuda')
torch.backends.cudnn.benchmark = args.cudnn_benchmark
if args.seed is not None:
torch.manual_seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
# set up distributed training
multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
if multi_gpu:
torch.cuda.set_device(args.local_rank)
distrib.init_process_group(backend='nccl', init_method='env://')
print_once(f'Inference with {distrib.get_world_size()} GPUs')
cfg = config.load(args.model_config)
config.apply_config_overrides(cfg, args)
symbols = helpers.add_ctc_blank(cfg['labels'])
use_dali = args.dali_device in ('cpu', 'gpu')
dataset_kw, features_kw = config.input(cfg, 'val')
measure_perf = args.steps > 0
# dataset
if args.transcribe_wav or args.transcribe_filelist:
if use_dali:
print("DALI supported only with input .json files; disabling")
use_dali = False
assert not args.pad_to_max_duration
assert not (args.transcribe_wav and args.transcribe_filelist)
if args.transcribe_wav:
dataset = SingleAudioDataset(args.transcribe_wav)
else:
dataset = FilelistDataset(args.transcribe_filelist)
data_loader = get_data_loader(dataset,
batch_size=1,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=0,
drop_last=(True if measure_perf else False))
_, features_kw = config.input(cfg, 'val')
feat_proc = FilterbankFeatures(**features_kw)
elif use_dali:
# pad_to_max_duration is not supported by DALI - have simple padders
if features_kw['pad_to_max_duration']:
feat_proc = BaseFeatures(
pad_align=features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=features_kw['max_duration'],
sample_rate=features_kw['sample_rate'],
window_size=features_kw['window_size'],
window_stride=features_kw['window_stride'])
features_kw['pad_to_max_duration'] = False
else:
feat_proc = None
data_loader = DaliDataLoader(
gpu_id=args.local_rank or 0,
dataset_path=args.dataset_dir,
config_data=dataset_kw,
config_features=features_kw,
json_names=args.val_manifests,
batch_size=args.batch_size,
pipeline_type=("train" if measure_perf else "val"), # no drop_last
device_type=args.dali_device,
symbols=symbols)
else:
dataset = AudioDataset(args.dataset_dir,
args.val_manifests,
symbols,
**dataset_kw)
data_loader = get_data_loader(dataset,
args.batch_size,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=4,
drop_last=False)
feat_proc = FilterbankFeatures(**features_kw)
model = QuartzNet(encoder_kw=config.encoder(cfg),
decoder_kw=config.decoder(cfg, n_classes=len(symbols)))
if args.ckpt is not None:
print(f'Loading the model from {args.ckpt} ...')
checkpoint = torch.load(args.ckpt, map_location="cpu")
key = 'ema_state_dict' if args.ema else 'state_dict'
state_dict = checkpoint[key]
model.load_state_dict(state_dict, strict=True)
model.to(device)
model.eval()
if feat_proc is not None:
feat_proc.to(device)
feat_proc.eval()
if args.amp:
model = model.half()
if args.torchscript:
greedy_decoder = GreedyCTCDecoder()
feat_proc, model, greedy_decoder = torchscript_export(
data_loader, feat_proc, model, greedy_decoder, args.output_dir,
use_amp=args.amp, use_conv_masks=True, model_config=args.model_config,
device=device, save=args.torchscript_export)
if multi_gpu:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank)
agg = {'txts': [], 'preds': [], 'logits': []}
dur = {'data': [], 'dnn': [], 'data+dnn': []}
looped_loader = chain.from_iterable(repeat(data_loader))
greedy_decoder = GreedyCTCDecoder()
sync = lambda: torch.cuda.synchronize() if device.type == 'cuda' else None
steps = args.steps + args.warmup_steps or len(data_loader)
with torch.no_grad():
for it, batch in enumerate(tqdm(looped_loader, initial=1, total=steps)):
if use_dali:
feats, feat_lens, txt, txt_lens = batch
if feat_proc is not None:
feats, feat_lens = feat_proc(feats, feat_lens)
else:
batch = [t.to(device, non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feats, feat_lens = feat_proc(audio, audio_lens)
sync()
t1 = time.perf_counter()
if args.amp:
feats = feats.half()
if model.encoder.use_conv_masks:
log_probs, log_prob_lens = model(feats, feat_lens)
else:
log_probs = model(feats, feat_lens)
preds = greedy_decoder(log_probs)
sync()
t2 = time.perf_counter()
# burn-in period; wait for a new loader due to num_workers
if it >= 1 and (args.steps == 0 or it >= args.warmup_steps):
dur['data'].append(t1 - t0)
dur['dnn'].append(t2 - t1)
dur['data+dnn'].append(t2 - t0)
if txt is not None:
agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
symbols)
agg['preds'] += helpers.gather_predictions([preds], symbols)
agg['logits'].append(log_probs)
if it + 1 == steps:
break
sync()
t0 = time.perf_counter()
# communicate the results
if args.transcribe_wav:
for idx, p in enumerate(agg['preds']):
print_once(f'Prediction {idx+1: >3}: {p}')
elif args.transcribe_filelist:
pass
elif not multi_gpu or distrib.get_rank() == 0:
wer, _ = process_evaluation_epoch(agg)
dllogger.log(step=(), data={'eval_wer': 100 * wer})
if args.save_predictions:
with open(args.save_predictions, 'w') as f:
f.write('\n'.join(agg['preds']))
if args.save_logits:
logits = torch.cat(agg['logits'], dim=0).cpu()
torch.save(logits, args.save_logits)
# report timings
if len(dur['data']) >= 20:
ratios = [0.9, 0.95, 0.99]
for stage in dur:
lat = durs_to_percentiles(dur[stage], ratios)
for k in [0.99, 0.95, 0.9, 0.5]:
kk = str(k).replace('.', '_')
dllogger.log(step=(), data={f'{stage.lower()}_latency_{kk}': lat[k]})
else:
print_once('Not enough samples to measure latencies.')
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=16}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=4}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=16}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=4}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,140 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import inspect
from ast import literal_eval
from contextlib import suppress
from numbers import Number
import yaml
from common.audio import GainPerturbation, ShiftPerturbation, SpeedPerturbation
from common.dataset import AudioDataset
from common.features import (CutoutAugment, FilterbankFeatures, SpecAugment)
from quartznet.model import JasperDecoderForCTC, JasperBlock, JasperEncoder
def default_args(klass):
sig = inspect.signature(klass.__init__)
return {k: v.default for k, v in sig.parameters.items() if k != 'self'}
def load(fpath):
cfg = yaml.safe_load(open(fpath, 'r'))
# Reload to deep copy shallow copies, which were made with yaml anchors
yaml.Dumper.ignore_aliases = lambda *args: True
cfg = yaml.dump(cfg)
cfg = yaml.safe_load(cfg)
return cfg
def validate_and_fill(klass, user_conf, ignore_unk=[], optional=[]):
conf = default_args(klass)
for k, v in user_conf.items():
assert k in conf or k in ignore_unk, f'Unknown param {k} for {klass}'
conf[k] = v
# Keep only mandatory or optional-nonempty
conf = {k: v for k, v in conf.items()
if k not in optional or v is not inspect.Parameter.empty}
# Validate
for k, v in conf.items():
assert v is not inspect.Parameter.empty, \
f'Value for {k} not specified for {klass}'
return conf
def input(conf_yaml, split='train'):
conf = copy.deepcopy(conf_yaml[f'input_{split}'])
conf_dataset = conf.pop('audio_dataset')
conf_features = conf.pop('filterbank_features')
# Validate known inner classes
inner_classes = [
(conf_dataset, 'speed_perturbation', SpeedPerturbation),
(conf_dataset, 'gain_perturbation', GainPerturbation),
(conf_dataset, 'shift_perturbation', ShiftPerturbation),
(conf_features, 'spec_augment', SpecAugment),
(conf_features, 'cutout_augment', CutoutAugment),
]
for conf_tgt, key, klass in inner_classes:
if key in conf_tgt:
conf_tgt[key] = validate_and_fill(klass, conf_tgt[key])
for k in conf:
raise ValueError(f'Unknown key {k}')
# Validate outer classes
conf_dataset = validate_and_fill(
AudioDataset, conf_dataset,
optional=['data_dir', 'labels', 'manifest_fpaths'])
# klass = feature_class(conf_features['feature_type'])
# conf_features = validate_and_fill(
# klass, conf_features, ignore_unk=['feature_type'])
conf_features = validate_and_fill(
FilterbankFeatures, conf_features) # , ignore_unk=['feature_type'])
# Check params shared between classes
shared = ['sample_rate', 'max_duration', 'pad_to_max_duration']
for sh in shared:
assert conf_dataset[sh] == conf_features[sh], (
f'{sh} should match in Dataset and FeatureProcessor: '
f'{conf_dataset[sh]}, {conf_features[sh]}')
return conf_dataset, conf_features
def encoder(conf):
"""Validate config for JasperEncoder and subsequent JasperBlocks"""
# Validate, but don't overwrite with defaults
for blk in conf['quartznet']['encoder']['blocks']:
validate_and_fill(JasperBlock, blk, optional=['infilters'],
ignore_unk=['residual_dense'])
return validate_and_fill(JasperEncoder, conf['quartznet']['encoder'])
def decoder(conf, n_classes):
decoder_kw = {'n_classes': n_classes, **conf['quartznet']['decoder']}
return validate_and_fill(JasperDecoderForCTC, decoder_kw)
def apply_config_overrides(conf, args):
if args.override_config is None:
return
for override_key_val in args.override_config:
key, val = override_key_val.split('=')
with suppress(TypeError, ValueError):
val = literal_eval(val)
apply_nested_config_override(conf, key, val)
def apply_nested_config_override(conf, key_str, val):
fields = key_str.split('.')
for f in fields[:-1]:
conf = conf[f]
f = fields[-1]
assert (f not in conf
or type(val) is type(conf[f])
or (isinstance(val, Number) and isinstance(conf[f], Number)))
conf[f] = val
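Illustrative sketch of the helpers above, mirroring what `--override_config input_val.audio_dataset.max_duration=...` resolves to in scripts/inference.sh; the YAML path is the default config referenced there.
cfg = load('configs/quartznet15x5_speedp-online-1.15_speca.yaml')
apply_nested_config_override(cfg, 'input_val.audio_dataset.max_duration', 20.0)
apply_nested_config_override(cfg, 'input_val.filterbank_features.max_duration', 20.0)
dataset_kw, features_kw = input(cfg, 'val')   # validated kwargs for dataset/features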

View file

@@ -0,0 +1,391 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F
activations = {
"hardtanh": nn.Hardtanh,
"relu": nn.ReLU,
"selu": nn.SELU,
}
def init_weights(m, mode='xavier_uniform'):
if type(m) == nn.Conv1d or type(m) == MaskedConv1d:
if mode == 'xavier_uniform':
nn.init.xavier_uniform_(m.weight, gain=1.0)
elif mode == 'xavier_normal':
nn.init.xavier_normal_(m.weight, gain=1.0)
elif mode == 'kaiming_uniform':
nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
elif mode == 'kaiming_normal':
nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
else:
raise ValueError("Unknown Initialization mode: {0}".format(mode))
elif type(m) == nn.BatchNorm1d:
if m.track_running_stats:
m.running_mean.zero_()
m.running_var.fill_(1)
m.num_batches_tracked.zero_()
if m.affine:
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
def compute_new_kernel_size(kernel_size, kernel_width):
new_kernel_size = max(int(kernel_size * kernel_width), 1)
# If kernel is even shape, round up to make it odd
if new_kernel_size % 2 == 0:
new_kernel_size += 1
return new_kernel_size
def get_same_padding(kernel_size, stride, dilation):
if stride > 1 and dilation > 1:
raise ValueError("Only stride OR dilation may be greater than 1")
return (kernel_size // 2) * dilation
class GroupShuffle(nn.Module):
def __init__(self, groups, channels):
super(GroupShuffle, self).__init__()
self.groups = groups
self.channels_per_group = channels // groups
def forward(self, x):
sh = x.shape
x = x.view(-1, self.groups, self.channels_per_group, sh[-1])
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(-1, self.groups * self.channels_per_group, sh[-1])
return x
class MaskedConv1d(nn.Conv1d):
"""1D convolution with sequence masking
"""
__constants__ = ["masked"]
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, dilation=1, groups=1, bias=False, use_mask=True,
heads=-1):
# Jasper refactor compat
assert heads == -1 # Unsupported
masked = use_mask
super(MaskedConv1d, self).__init__(
in_channels, out_channels, kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=bias)
self.masked = masked
def get_seq_len(self, lens):
pad, ks = self.padding[0], self.kernel_size[0]
return torch.div(lens + 2 * pad - self.dilation[0] * (ks - 1) - 1,
self.stride[0], rounding_mode='trunc') + 1
def forward(self, x, x_lens=None):
if self.masked:
max_len = x.size(2)
idxs = torch.arange(max_len, dtype=x_lens.dtype, device=x.device)
mask = idxs.expand(x_lens.size(0), max_len) >= x_lens.unsqueeze(1)
x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0)
x_lens = self.get_seq_len(x_lens)
return super(MaskedConv1d, self).forward(x), x_lens
class JasperBlock(nn.Module):
__constants__ = ["conv_mask", "separable", "res", "mconv"]
def __init__(self, infilters, filters, repeat=3, kernel_size=11,
kernel_size_factor=1, stride=1, dilation=1, padding='same',
dropout=0.2, activation=None, residual=True, groups=1,
separable=False, heads=-1, normalization="batch",
norm_groups=1, residual_panes=[], use_conv_masks=False):
super(JasperBlock, self).__init__()
# Fix params being passed as list, but default to ints
wrap = lambda v: [v] if type(v) is int else v
kernel_size = wrap(kernel_size)
dilation = wrap(dilation)
padding = wrap(padding)
stride = wrap(stride)
if padding != "same":
raise ValueError("currently only 'same' padding is supported")
kernel_size_factor = float(kernel_size_factor)
if type(kernel_size) in (list, tuple):
kernel_size = [compute_new_kernel_size(k, kernel_size_factor)
for k in kernel_size]
else:
kernel_size = compute_new_kernel_size(kernel_size,
kernel_size_factor)
padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])
self.conv_mask = use_conv_masks
self.separable = separable
infilters_loop = infilters
conv = nn.ModuleList()
for _ in range(repeat - 1):
conv.extend(
self._get_conv_bn_layer(
infilters_loop, filters, kernel_size=kernel_size,
stride=stride, dilation=dilation, padding=padding_val,
groups=groups, heads=heads, separable=separable,
normalization=normalization, norm_groups=norm_groups)
)
conv.extend(self._get_act_dropout_layer(drop_prob=dropout,
activation=activation))
infilters_loop = filters
conv.extend(
self._get_conv_bn_layer(
infilters_loop, filters, kernel_size=kernel_size, stride=stride,
dilation=dilation, padding=padding_val, groups=groups,
heads=heads, separable=separable, normalization=normalization,
norm_groups=norm_groups)
)
self.mconv = conv
res_panes = residual_panes.copy()
self.dense_residual = residual
if residual:
res_list = nn.ModuleList()
if len(residual_panes) == 0:
res_panes = [infilters]
self.dense_residual = False
for ip in res_panes:
res_list.append(nn.ModuleList(
self._get_conv_bn_layer(ip, filters, kernel_size=1,
normalization=normalization,
norm_groups=norm_groups, stride=[1])
))
self.res = res_list
else:
self.res = None
self.mout = nn.Sequential(*self._get_act_dropout_layer(
drop_prob=dropout, activation=activation))
def _get_conv(self, in_channels, out_channels, kernel_size=11, stride=1,
dilation=1, padding=0, bias=False, groups=1, heads=-1,
separable=False):
kw = {'in_channels': in_channels, 'out_channels': out_channels,
'kernel_size': kernel_size, 'stride': stride, 'dilation': dilation,
'padding': padding, 'bias': bias, 'groups': groups}
if self.conv_mask:
return MaskedConv1d(**kw, heads=heads, use_mask=self.conv_mask)
else:
return nn.Conv1d(**kw)
def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11,
stride=1, dilation=1, padding=0, bias=False,
groups=1, heads=-1, separable=False,
normalization="batch", norm_groups=1):
if norm_groups == -1:
norm_groups = out_channels
if separable:
layers = [
self._get_conv(in_channels, in_channels, kernel_size,
stride=stride, dilation=dilation, padding=padding,
bias=bias, groups=in_channels, heads=heads),
self._get_conv(in_channels, out_channels, kernel_size=1,
stride=1, dilation=1, padding=0, bias=bias,
groups=groups),
]
else:
layers = [
self._get_conv(in_channels, out_channels, kernel_size,
stride=stride, dilation=dilation,
padding=padding, bias=bias, groups=groups)
]
if normalization == "group":
layers.append(nn.GroupNorm(num_groups=norm_groups,
num_channels=out_channels))
elif normalization == "instance":
layers.append(nn.GroupNorm(num_groups=out_channels,
num_channels=out_channels))
elif normalization == "layer":
layers.append(nn.GroupNorm(num_groups=1, num_channels=out_channels))
elif normalization == "batch":
layers.append(nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1))
else:
raise ValueError(
f"Normalization method ({normalization}) does not match"
f" one of [batch, layer, group, instance]."
)
if groups > 1:
layers.append(GroupShuffle(groups, out_channels))
return layers
def _get_act_dropout_layer(self, drop_prob=0.2, activation=None):
if activation is None:
activation = nn.Hardtanh(min_val=0.0, max_val=20.0)
layers = [activation, nn.Dropout(p=drop_prob)]
return layers
def forward(self, xs, xs_lens=None):
if not self.conv_mask:
xs_lens = 0
# compute forward convolutions
out = xs[-1]
lens = xs_lens
for i, l in enumerate(self.mconv):
# if we're doing masked convolutions, we need to pass in and
# possibly update the sequence lengths
# if (i % 4) == 0 and self.conv_mask:
if isinstance(l, MaskedConv1d):
out, lens = l(out, lens)
else:
out = l(out)
# compute the residuals
if self.res is not None:
for i, layer in enumerate(self.res):
res_out = xs[i]
for j, res_layer in enumerate(layer):
if isinstance(res_layer, MaskedConv1d):
res_out, _ = res_layer(res_out, xs_lens)
else:
res_out = res_layer(res_out)
out = out + res_out
# compute the output
out = self.mout(out)
if self.res is not None and self.dense_residual:
out = xs + [out]
else:
out = [out]
return (out, lens) if self.conv_mask else (out, None)
class JasperEncoder(nn.Module):
__constants__ = ["use_conv_masks"]
def __init__(self, in_feats, activation, frame_splicing=1,
init='xavier_uniform', use_conv_masks=False, blocks=[]):
super(JasperEncoder, self).__init__()
self.use_conv_masks = use_conv_masks
self.layers = nn.ModuleList()
in_feats *= frame_splicing
all_residual_panes = []
for i, blk in enumerate(blocks):
blk['activation'] = activations[activation]()
has_residual_dense = blk.pop('residual_dense', False)
if has_residual_dense:
all_residual_panes += [in_feats]
blk['residual_panes'] = all_residual_panes
else:
blk['residual_panes'] = []
self.layers.append(
JasperBlock(in_feats, use_conv_masks=use_conv_masks, **blk))
in_feats = blk['filters']
self.apply(lambda x: init_weights(x, mode=init))
def forward(self, x, x_lens=None):
out, out_lens = [x], x_lens
for layer in self.layers:
out, out_lens = layer(out, out_lens)
return out, out_lens
class JasperDecoderForCTC(nn.Module):
def __init__(self, in_feats, n_classes, init='xavier_uniform'):
super(JasperDecoderForCTC, self).__init__()
self.layers = nn.Sequential(
nn.Conv1d(in_feats, n_classes, kernel_size=1, bias=True),)
self.apply(lambda x: init_weights(x, mode=init))
def forward(self, enc_out):
out = self.layers(enc_out[-1]).transpose(1, 2)
return F.log_softmax(out, dim=2)
class GreedyCTCDecoder(nn.Module):
@torch.no_grad()
def forward(self, log_probs):
return log_probs.argmax(dim=-1, keepdim=False).int()
class QuartzNet(nn.Module):
def __init__(self, encoder_kw, decoder_kw, transpose_in=False):
super(QuartzNet, self).__init__()
self.transpose_in = transpose_in
self.encoder = JasperEncoder(**encoder_kw)
self.decoder = JasperDecoderForCTC(**decoder_kw)
def forward(self, x, x_lens=None):
if self.encoder.use_conv_masks:
assert x_lens is not None
enc, enc_lens = self.encoder(x, x_lens)
out = self.decoder(enc)
return out, enc_lens
else:
if self.transpose_in:
x = x.transpose(1, 2)
enc, _ = self.encoder(x)
out = self.decoder(enc)
return out # XXX torchscript refuses to output None
# TODO Explicitly add x_lens=None for inference (now x can be a Tensor or tuple)
def infer(self, x):
if self.encoder.use_conv_masks:
return self.forward(x)
else:
ret = self.forward(x[0])
return ret, len(ret)
class CTCLossNM:
def __init__(self, n_classes):
self._criterion = nn.CTCLoss(blank=n_classes-1, reduction='none')
def __call__(self, log_probs, targets, input_length, target_length):
input_length = input_length.long()
target_length = target_length.long()
targets = targets.long()
loss = self._criterion(log_probs.transpose(1, 0), targets,
input_length, target_length)
# note that this is different from reduction = 'mean'
# because we are not dividing by target lengths
return torch.mean(loss)
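A minimal shape-check sketch using a single hypothetical block rather than the released 15x5 configuration; it only demonstrates how encoder_kw/decoder_kw are consumed and how sequence lengths shrink with the stride-2 block.
import torch

encoder_kw = dict(
    in_feats=64, activation='relu', frame_splicing=1, use_conv_masks=True,
    blocks=[dict(filters=256, repeat=1, kernel_size=[33], dilation=[1],
                 stride=[2], dropout=0.0, residual=False, separable=True)])
decoder_kw = dict(in_feats=256, n_classes=29)   # 28 labels + CTC blank
model = QuartzNet(encoder_kw, decoder_kw).eval()

feats = torch.randn(4, 64, 500)                 # (batch, n_filt, time)
feat_lens = torch.full((4,), 500, dtype=torch.int)
with torch.no_grad():
    log_probs, enc_lens = model(feats, feat_lens)
print(log_probs.shape, enc_lens)                # expected (4, 250, 29), lengths 250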

View file

@@ -0,0 +1,6 @@
tqdm==4.53.0
librosa==0.8.0
soundfile
sox==1.4.1
pyyaml
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

View file

@@ -0,0 +1,3 @@
#!/bin/bash
docker build . --rm -t quartznet

View file

@@ -0,0 +1,24 @@
#!/bin/bash
SCRIPT_DIR=$(cd $(dirname $0); pwd)
QN_REPO=${QN_REPO:-"${SCRIPT_DIR}/../.."}
DATA_DIR=${1:-${DATA_DIR:-${QN_REPO}"/datasets"}}
CHECKPOINT_DIR=${2:-${CHECKPOINT_DIR:-${QN_REPO}"/checkpoints"}}
RESULT_DIR=${3:-${RESULT_DIR:-${QN_REPO}"/results"}}
PROGRAM_PATH=${PROGRAM_PATH}
MOUNTS=""
MOUNTS+=" -v $DATA_DIR:/datasets"
MOUNTS+=" -v $CHECKPOINT_DIR:/checkpoints"
MOUNTS+=" -v $RESULT_DIR:/results"
MOUNTS+=" -v ${QN_REPO}:/quartznet"
docker run -it --rm --gpus all \
--env PYTHONDONTWRITEBYTECODE=1 \
--shm-size=4g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
$MOUNTS \
-w /quartznet \
quartznet:latest bash $PROGRAM_PATH

View file

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DATA_SET="LibriSpeech"
DATA_ROOT_DIR="/datasets"
DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}"
if [ ! -d "$DATA_DIR" ]
then
mkdir --mode 755 $DATA_DIR
python utils/download_librispeech.py \
utils/librispeech.csv \
$DATA_DIR \
-e ${DATA_ROOT_DIR}/
else
echo "Directory $DATA_DIR already exists."
fi

View file

@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
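# PREDICTION_FILE defaults to an empty string here, so inference.sh does not
# write a predictions file unless the caller overrides it; all other settings
# are forwarded to the inference script.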
: ${PREDICTION_FILE:=}
bash ./scripts/inference.sh "$@"

View file

@ -0,0 +1,63 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${MODEL_CONFIG:=${2:-"configs/quartznet15x5_speedp-online-1.15_speca.yaml"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CHECKPOINT:=${4:-"/checkpoints/quartznet_fp16.pt"}}
: ${DATASET:="test-other"}
: ${LOG_FILE:=""}
: ${CUDNN_BENCHMARK:=false}
: ${MAX_DURATION:=""}
: ${PAD_TO_MAX_DURATION:=false}
: ${NUM_GPUS:=1}
: ${NUM_STEPS:=0}
: ${NUM_WARMUP_STEPS:=0}
: ${AMP:=false}
: ${BATCH_SIZE:=64}
: ${EMA:=true}
: ${SEED:=0}
: ${DALI_DEVICE:="gpu"}
: ${CPU:=false}
: ${LOGITS_FILE:=}
: ${PREDICTION_FILE:="${OUTPUT_DIR}/${DATASET}.predictions"}
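# Illustrative invocation (values are examples; any of the variables above can
# be overridden the same way):
#   DATASET=test-clean BATCH_SIZE=32 AMP=true bash ./scripts/inference.sh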
mkdir -p "$OUTPUT_DIR"
ARGS="--dataset_dir=$DATA_DIR"
ARGS+=" --val_manifest=$DATA_DIR/librispeech-${DATASET}-wav.json"
ARGS+=" --model_config=$MODEL_CONFIG"
ARGS+=" --output_dir=$OUTPUT_DIR"
ARGS+=" --batch_size=$BATCH_SIZE"
ARGS+=" --seed=$SEED"
ARGS+=" --dali_device=$DALI_DEVICE"
ARGS+=" --steps $NUM_STEPS"
ARGS+=" --warmup_steps $NUM_WARMUP_STEPS"
[ "$AMP" = true ] && ARGS+=" --amp"
[ "$EMA" = true ] && ARGS+=" --ema"
[ "$CUDNN_BENCHMARK" = true ] && ARGS+=" --cudnn_benchmark"
[ -n "$CHECKPOINT" ] && ARGS+=" --ckpt=${CHECKPOINT}"
[ -n "$LOG_FILE" ] && ARGS+=" --log_file $LOG_FILE"
[ -n "$PREDICTION_FILE" ] && ARGS+=" --save_prediction $PREDICTION_FILE"
[ -n "$LOGITS_FILE" ] && ARGS+=" --logits_save_to $LOGITS_FILE"
[ "$CPU" == "true" ] && ARGS+=" --cpu"
[ -n "$MAX_DURATION" ] && ARGS+=" --override_config input_val.audio_dataset.max_duration=$MAX_DURATION" \
ARGS+=" --override_config input_val.filterbank_features.max_duration=$MAX_DURATION"
[ "$PAD_TO_MAX_DURATION" = true ] && ARGS+=" --override_config input_val.audio_dataset.pad_to_max_duration=True" \
ARGS+=" --override_config input_val.filterbank_features.pad_to_max_duration=True"
python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS inference.py $ARGS

View file

@ -0,0 +1,37 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CUDNN_BENCHMARK:=true}
: ${PAD_TO_MAX_DURATION:=true}
: ${NUM_WARMUP_STEPS:=10}
: ${NUM_STEPS:=500}
: ${AMP:=false}
: ${DALI_DEVICE:="cpu"}
: ${BATCH_SIZE_SEQ:="1 2 4 8 16"}
: ${MAX_DURATION_SEQ:="2 7 16.7"}
for MAX_DURATION in $MAX_DURATION_SEQ; do
for BATCH_SIZE in $BATCH_SIZE_SEQ; do
LOG_FILE="$OUTPUT_DIR/perf-infer_dali-${DALI_DEVICE}_amp-${AMP}_dur${MAX_DURATION}_bs${BATCH_SIZE}.json"
bash ./scripts/inference.sh "$@"
done
done

View file

@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-clean-100 \
--dest_dir /datasets/LibriSpeech/train-clean-100-wav \
--output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-clean-360 \
--dest_dir /datasets/LibriSpeech/train-clean-360-wav \
--output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-other-500 \
--dest_dir /datasets/LibriSpeech/train-other-500-wav \
--output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/dev-clean \
--dest_dir /datasets/LibriSpeech/dev-clean-wav \
--output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/dev-other \
--dest_dir /datasets/LibriSpeech/dev-other-wav \
--output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/test-clean \
--dest_dir /datasets/LibriSpeech/test-clean-wav \
--output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/test-other \
--dest_dir /datasets/LibriSpeech/test-other-wav \
--output_json /datasets/LibriSpeech/librispeech-test-other-wav.json

View file

@ -0,0 +1,100 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export OMP_NUM_THREADS=1
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${MODEL_CONFIG:=${2:-"configs/quartznet15x5_speedp-online-1.15_speca.yaml"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CHECKPOINT:=${4:-}}
: ${CUDNN_BENCHMARK:=true}
: ${NUM_GPUS:=8}
: ${AMP:=false}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${OPTIMIZER:=fused_novograd}
: ${LEARNING_RATE:=0.01}
: ${LR_POLICY:=exponential}
: ${LR_EXP_GAMMA:=0.981}
: ${EMA:=0.999}
: ${MULTI_TENSOR_EMA:=true}
: ${SEED:=0}
: ${EPOCHS:=260}
: ${WARMUP_EPOCHS:=2}
: ${HOLD_EPOCHS:=140}
: ${SAVE_FREQUENCY:=10}
: ${EPOCHS_THIS_JOB:=0}
: ${DALI_DEVICE:="gpu"}
: ${PAD_TO_MAX_DURATION:=false}
: ${EVAL_FREQUENCY:=241}
: ${PREDICTION_FREQUENCY:=241}
: ${TRAIN_MANIFESTS:="$DATA_DIR/librispeech-train-clean-100-wav.json \
$DATA_DIR/librispeech-train-clean-360-wav.json \
$DATA_DIR/librispeech-train-other-500-wav.json"}
: ${VAL_MANIFESTS:="$DATA_DIR/librispeech-dev-clean-wav.json"}
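# Illustrative invocation (values are examples): a 4-GPU run that preserves the
# reference global batch size of 8 x 72 x 2 = 1152 could use
#   NUM_GPUS=4 GPU_BATCH_SIZE=72 GRAD_ACCUMULATION=4 AMP=true bash ./scripts/train.sh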
mkdir -p "$OUTPUT_DIR"
ARGS="--dataset_dir=$DATA_DIR"
ARGS+=" --val_manifests $VAL_MANIFESTS"
ARGS+=" --train_manifests $TRAIN_MANIFESTS"
ARGS+=" --model_config=$MODEL_CONFIG"
ARGS+=" --output_dir=$OUTPUT_DIR"
ARGS+=" --lr=$LEARNING_RATE"
ARGS+=" --gpu_batch_size=$GPU_BATCH_SIZE"
ARGS+=" --min_lr=1e-5"
ARGS+=" --lr_policy=$LR_POLICY"
ARGS+=" --lr_exp_gamma=$LR_EXP_GAMMA"
ARGS+=" --epochs=$EPOCHS"
ARGS+=" --warmup_epochs=$WARMUP_EPOCHS"
ARGS+=" --hold_epochs=$HOLD_EPOCHS"
ARGS+=" --epochs_this_job=$EPOCHS_THIS_JOB"
ARGS+=" --ema=$EMA"
ARGS+=" --seed=$SEED"
ARGS+=" --optimizer=$OPTIMIZER"
ARGS+=" --weight_decay=1e-3"
ARGS+=" --resume"
ARGS+=" --save_frequency=$SAVE_FREQUENCY"
ARGS+=" --keep_milestones 100 200"
ARGS+=" --save_best_from=200"
ARGS+=" --log_frequency=1"
ARGS+=" --eval_frequency=$EVAL_FREQUENCY"
ARGS+=" --prediction_frequency=$PREDICTION_FREQUENCY"
ARGS+=" --grad_accumulation=$GRAD_ACCUMULATION "
ARGS+=" --dali_device=$DALI_DEVICE"
[ "$AMP" = true ] && ARGS+=" --amp"
[ "$CUDNN_BENCHMARK" = true ] && ARGS+=" --cudnn_benchmark"
[ -n "$MAX_DURATION" ] && ARGS+=" --override_config input_train.audio_dataset.max_duration=$MAX_DURATION" \
ARGS+=" --override_config input_train.filterbank_features.max_duration=$MAX_DURATION"
[ "$PAD_TO_MAX_DURATION" = true ] && ARGS+=" --override_config input_train.audio_dataset.pad_to_max_duration=True" \
ARGS+=" --override_config input_train.filterbank_features.pad_to_max_duration=True"
[ -n "$CHECKPOINT" ] && ARGS+=" --ckpt=${CHECKPOINT}"
[ -n "$LOG_FILE" ] && ARGS+=" --log_file $LOG_FILE"
[ -n "$PRE_ALLOCATE" ] && ARGS+=" --pre_allocate_range $PRE_ALLOCATE"
[ "$MULTI_TENSOR_EMA" = true ] && ARGS+=" --multi_tensor_ema"
[ -n "$BENCHMARK_EPOCHS" ] && ARGS+=" --benchmark_epochs_num=$BENCHMARK_EPOCHS"
GBS=$(($NUM_GPUS * $GPU_BATCH_SIZE * $GRAD_ACCUMULATION))
if [ $GBS -ne $((8 * 144)) ]; then
echo -e "\nWARNING: Global batch size changed from $((8 * 144)) to ${GBS}."
sleep 3
fi
echo -e "\nAMP=$AMP,""${NUM_GPUS}x${GPU_BATCH_SIZE}x${GRAD_ACCUMULATION}" \
"(global batch size ${GBS})\n"
: ${DISTRIBUTED:="-m torch.distributed.launch --nproc_per_node=$NUM_GPUS"}
python $DISTRIBUTED train.py $ARGS

View file

@ -0,0 +1,57 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${TRAIN_MANIFESTS:="$DATA_DIR/librispeech-train-clean-100-wav.json"}
: ${BENCHMARK_EPOCHS:=20}
: ${EPOCHS:=100000}
: ${RESUME:=false}
: ${SAVE_FREQUENCY:=100000}
: ${EVAL_FREQUENCY:=100000}
: ${LEARNING_RATE:=0.0001}
: ${AMP:=false}
: ${EMA:=0}
: ${DALI_DEVICE:="gpu"}
: ${NUM_GPUS_SEQ:="8 4 1"}
: ${ACC_BATCH_SIZE:="144"}
: ${GRAD_ACC_SEQ:="4 2"}
# A range of batch lengths for LibriSpeech
# with continuous speed perturbation (0.85, 1.15) and max duration 16.7s
: ${PRE_ALLOCATE:="1408 1920"}
for NUM_GPUS in $NUM_GPUS_SEQ; do
for GRAD_ACCUMULATION in $GRAD_ACC_SEQ; do
# Scale the number of epochs to the number of GPUs
BMARK=$((BENCHMARK_EPOCHS * NUM_GPUS / 8))
BMARK=$((BMARK < 2 ? 2 : BMARK))
BMARK=$((BMARK > BENCHMARK_EPOCHS ? BENCHMARK_EPOCHS : BMARK))
EPOCHS_THIS_JOB=$((BMARK + 1))
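# GPU_BATCH_SIZE is chosen so that the global batch size stays fixed at
# ACC_BATCH_SIZE * 8 = 1152 samples for every (NUM_GPUS, GRAD_ACCUMULATION) pair.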
GPU_BATCH_SIZE=$((ACC_BATCH_SIZE / $GRAD_ACCUMULATION * 8 / $NUM_GPUS))
LOG_FILE="$OUTPUT_DIR/perf-train_dali-${DALI_DEVICE}_amp-${AMP}_"
LOG_FILE+="1x${NUM_GPUS}x${GPU_BATCH_SIZE}x${GRAD_ACCUMULATION}.json"
BENCHMARK_EPOCHS=$BMARK bash ./scripts/train.sh "$@"
done
done

View file

@ -0,0 +1,558 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import os
import random
import time
try:
import nvidia_dlprof_pytorch_nvtx as pyprof
except ImportError:
import pyprof
import torch
import amp_C
import numpy as np
import torch.cuda.profiler as profiler
import torch.distributed as dist
from apex.optimizers import FusedLAMB, FusedNovoGrad
from contextlib import suppress as empty_context
from common import helpers
from common.dali.data_loader import DaliDataLoader
from common.dataset import AudioDataset, get_data_loader
from common.features import BaseFeatures, FilterbankFeatures
from common.helpers import (Checkpointer, greedy_wer, num_weights, print_once,
process_evaluation_epoch)
from common.optimizers import AdamW, lr_policy, Novograd
from common.tb_dllogger import flush_log, init_log, log
from common.utils import BenchmarkStats
from quartznet import config
from quartznet.model import CTCLossNM, GreedyCTCDecoder, QuartzNet
def parse_args():
parser = argparse.ArgumentParser(description='QuartzNet')
training = parser.add_argument_group('training setup')
training.add_argument('--epochs', default=400, type=int,
help='Number of epochs for the entire training; influences the lr schedule')
training.add_argument("--warmup_epochs", default=0, type=int,
help='Initial epochs of increasing learning rate')
training.add_argument("--hold_epochs", default=0, type=int,
help='Constant max learning rate epochs after warmup')
training.add_argument('--epochs_this_job', default=0, type=int,
help=('Run for a number of epochs with no effect on the lr schedule. '
'Useful for restarting the training.'))
training.add_argument('--cudnn_benchmark', action='store_true', default=True,
help='Enable cudnn benchmark')
training.add_argument('--amp', '--fp16', action='store_true', default=False,
help='Use pytorch native mixed precision training')
training.add_argument('--seed', default=1, type=int, help='Random seed')
training.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
help='GPU id used for distributed training')
training.add_argument('--pre_allocate_range', default=None, type=int, nargs=2,
help='Warmup with batches of length [min, max] before training')
training.add_argument('--pyprof', action='store_true', help='Enable pyprof profiling')
optim = parser.add_argument_group('optimization setup')
optim.add_argument('--gpu_batch_size', default=32, type=int,
help='Batch size for a single forward/backward pass. '
'The effective batch size is gpu_batch_size * grad_accumulation.')
optim.add_argument('--lr', default=1e-3, type=float,
help='Peak learning rate')
optim.add_argument("--min_lr", default=1e-5, type=float,
help='minimum learning rate')
optim.add_argument("--lr_policy", default='exponential', type=str,
choices=['exponential', 'legacy'], help='lr scheduler')
optim.add_argument("--lr_exp_gamma", default=0.99, type=float,
help='gamma factor for exponential lr scheduler')
optim.add_argument('--weight_decay', default=1e-3, type=float,
help='Weight decay for the optimizer')
optim.add_argument('--grad_accumulation', '--update-freq', default=1, type=int,
help='Number of accumulation steps')
optim.add_argument('--optimizer', default='novograd', type=str,
choices=['novograd', 'adamw', 'lamb98', 'fused_novograd'],
help='Optimization algorithm')
optim.add_argument('--ema', type=float, default=0.0,
help='Discount factor for exp averaging of model weights')
optim.add_argument('--multi_tensor_ema', action='store_true',
help='Use multi_tensor_apply for EMA')
io = parser.add_argument_group('feature and checkpointing setup')
io.add_argument('--dali_device', type=str, choices=['none', 'cpu', 'gpu'],
default='gpu', help='Use DALI pipeline for fast data processing')
io.add_argument('--resume', action='store_true',
help='Try to resume from last saved checkpoint.')
io.add_argument('--ckpt', default=None, type=str,
help='Path to a checkpoint for resuming training')
io.add_argument('--save_frequency', default=10, type=int,
help='Checkpoint saving frequency in epochs')
io.add_argument('--keep_milestones', default=[100, 200, 300], type=int, nargs='+',
help='Milestone checkpoints to keep from removing')
io.add_argument('--save_best_from', default=380, type=int,
help='Epoch on which to begin tracking best checkpoint (dev WER)')
io.add_argument('--eval_frequency', default=200, type=int,
help='Number of steps between evaluations on dev set')
io.add_argument('--log_frequency', default=25, type=int,
help='Number of steps between printing training stats')
io.add_argument('--prediction_frequency', default=100, type=int,
help='Number of steps between printing sample decodings')
io.add_argument('--model_config', type=str, required=True,
help='Path of the model configuration file')
io.add_argument('--train_manifests', type=str, required=True, nargs='+',
help='Paths of the training dataset manifest file')
io.add_argument('--val_manifests', type=str, required=True, nargs='+',
help='Paths of the evaluation datasets manifest files')
io.add_argument('--dataset_dir', required=True, type=str,
help='Root dir of dataset')
io.add_argument('--output_dir', type=str, required=True,
help='Directory for logs and checkpoints')
io.add_argument('--log_file', type=str, default=None,
help='Path to save the training logfile.')
io.add_argument('--benchmark_epochs_num', type=int, default=1,
help='Number of epochs included in the final average throughput.')
io.add_argument('--override_config', type=str, action='append',
help='Overrides arbitrary config value.'
' Syntax: `--override_config nested.config.key=val`.')
return parser.parse_args()
def reduce_tensor(tensor, num_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
return rt.true_divide(num_gpus)
def init_multi_tensor_ema(model, ema_model):
model_weights = list(model.state_dict().values())
ema_model_weights = list(ema_model.state_dict().values())
ema_overflow_buf = torch.cuda.IntTensor([0])
return model_weights, ema_model_weights, ema_overflow_buf
def apply_multi_tensor_ema(decay, model_weights, ema_model_weights, overflow_buf):
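    # Fused counterpart of apply_ema below: updates every weight tensor as
    # ema = decay * ema + (1 - decay) * model using apex's multi_tensor_axpby
    # kernel (65536 is the per-launch chunk size, in elements).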
amp_C.multi_tensor_axpby(
65536, overflow_buf,
[ema_model_weights, model_weights, ema_model_weights],
decay, 1-decay, -1)
def apply_ema(model, ema_model, decay):
if not decay:
return
sd = getattr(model, 'module', model).state_dict()
for k, v in ema_model.state_dict().items():
v.copy_(decay * v + (1 - decay) * sd[k])
@torch.no_grad()
def evaluate(epoch, step, val_loader, val_feat_proc, labels, model,
ema_model, ctc_loss, greedy_decoder, use_amp, use_dali=False):
for model, subset in [(model, 'dev'), (ema_model, 'dev_ema')]:
if model is None:
continue
model.eval()
start_time = time.time()
agg = {'losses': [], 'preds': [], 'txts': []}
for batch in val_loader:
if use_dali:
# with DALI, the data is already on GPU
feat, feat_lens, txt, txt_lens = batch
if val_feat_proc is not None:
feat, feat_lens = val_feat_proc(feat, feat_lens)
else:
batch = [t.cuda(non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feat, feat_lens = val_feat_proc(audio, audio_lens)
with torch.cuda.amp.autocast(enabled=use_amp):
log_probs, enc_lens = model(feat, feat_lens)
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
pred = greedy_decoder(log_probs)
agg['losses'] += helpers.gather_losses([loss])
agg['preds'] += helpers.gather_predictions([pred], labels)
agg['txts'] += helpers.gather_transcripts([txt], [txt_lens], labels)
wer, loss = process_evaluation_epoch(agg)
log((epoch,), step, subset, {'loss': loss, 'wer': 100.0 * wer,
'took': time.time() - start_time})
model.train()
return wer
def main():
args = parse_args()
assert(torch.cuda.is_available())
assert args.prediction_frequency % args.log_frequency == 0
torch.backends.cudnn.benchmark = args.cudnn_benchmark
# set up distributed training
multi_gpu = int(os.environ.get('WORLD_SIZE', 1)) > 1
if multi_gpu:
torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend='nccl', init_method='env://')
world_size = dist.get_world_size()
print_once(f'Distributed training with {world_size} GPUs\n')
else:
world_size = 1
torch.manual_seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
init_log(args)
cfg = config.load(args.model_config)
config.apply_config_overrides(cfg, args)
symbols = helpers.add_ctc_blank(cfg['labels'])
assert args.grad_accumulation >= 1
batch_size = args.gpu_batch_size
print_once('Setting up datasets...')
train_dataset_kw, train_features_kw = config.input(cfg, 'train')
val_dataset_kw, val_features_kw = config.input(cfg, 'val')
use_dali = args.dali_device in ('cpu', 'gpu')
if use_dali:
assert train_dataset_kw['ignore_offline_speed_perturbation'], \
"DALI doesn't support offline speed perturbation"
# pad_to_max_duration is not supported by DALI - have simple padders
if train_features_kw['pad_to_max_duration']:
train_feat_proc = BaseFeatures(
pad_align=train_features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=train_features_kw['max_duration'],
sample_rate=train_features_kw['sample_rate'],
window_size=train_features_kw['window_size'],
window_stride=train_features_kw['window_stride'])
train_features_kw['pad_to_max_duration'] = False
else:
train_feat_proc = None
if val_features_kw['pad_to_max_duration']:
val_feat_proc = BaseFeatures(
pad_align=val_features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=val_features_kw['max_duration'],
sample_rate=val_features_kw['sample_rate'],
window_size=val_features_kw['window_size'],
window_stride=val_features_kw['window_stride'])
val_features_kw['pad_to_max_duration'] = False
else:
val_feat_proc = None
train_loader = DaliDataLoader(gpu_id=args.local_rank,
dataset_path=args.dataset_dir,
config_data=train_dataset_kw,
config_features=train_features_kw,
json_names=args.train_manifests,
batch_size=batch_size,
grad_accumulation_steps=args.grad_accumulation,
pipeline_type="train",
device_type=args.dali_device,
symbols=symbols)
val_loader = DaliDataLoader(gpu_id=args.local_rank,
dataset_path=args.dataset_dir,
config_data=val_dataset_kw,
config_features=val_features_kw,
json_names=args.val_manifests,
batch_size=batch_size,
pipeline_type="val",
device_type=args.dali_device,
symbols=symbols)
else:
train_dataset_kw, train_features_kw = config.input(cfg, 'train')
train_dataset = AudioDataset(args.dataset_dir,
args.train_manifests,
symbols,
**train_dataset_kw)
train_loader = get_data_loader(train_dataset,
batch_size,
multi_gpu=multi_gpu,
shuffle=True,
num_workers=4)
train_feat_proc = FilterbankFeatures(**train_features_kw)
val_dataset_kw, val_features_kw = config.input(cfg, 'val')
val_dataset = AudioDataset(args.dataset_dir,
args.val_manifests,
symbols,
**val_dataset_kw)
val_loader = get_data_loader(val_dataset,
batch_size,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=4,
drop_last=False)
val_feat_proc = FilterbankFeatures(**val_features_kw)
dur = train_dataset.duration / 3600
dur_f = train_dataset.duration_filtered / 3600
nsampl = len(train_dataset)
print_once(f'Training samples: {nsampl} ({dur:.1f}h, '
f'filtered {dur_f:.1f}h)')
if train_feat_proc is not None:
train_feat_proc.cuda()
if val_feat_proc is not None:
val_feat_proc.cuda()
steps_per_epoch = len(train_loader) // args.grad_accumulation
# set up the model
model = QuartzNet(encoder_kw=config.encoder(cfg),
decoder_kw=config.decoder(cfg, n_classes=len(symbols)))
model.cuda()
ctc_loss = CTCLossNM(n_classes=len(symbols))
greedy_decoder = GreedyCTCDecoder()
print_once(f'Model size: {num_weights(model) / 10**6:.1f}M params\n')
# optimization
kw = {'lr': args.lr, 'weight_decay': args.weight_decay}
if args.optimizer == "novograd":
optimizer = Novograd(model.parameters(), **kw)
elif args.optimizer == "adamw":
optimizer = AdamW(model.parameters(), **kw)
elif args.optimizer == 'lamb98':
optimizer = FusedLAMB(model.parameters(), betas=(0.9, 0.98), eps=1e-9,
**kw)
elif args.optimizer == 'fused_novograd':
optimizer = FusedNovoGrad(model.parameters(), betas=(0.95, 0),
bias_correction=False, reg_inside_moment=True,
grad_averaging=False, **kw)
else:
raise ValueError(f'Invalid optimizer "{args.optimizer}"')
scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
adjust_lr = lambda step, epoch, optimizer: lr_policy(
step, epoch, args.lr, optimizer, steps_per_epoch=steps_per_epoch,
warmup_epochs=args.warmup_epochs, hold_epochs=args.hold_epochs,
num_epochs=args.epochs, policy=args.lr_policy, min_lr=args.min_lr,
exp_gamma=args.lr_exp_gamma)
if args.ema > 0:
ema_model = copy.deepcopy(model)
else:
ema_model = None
if multi_gpu:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank)
if args.pyprof:
pyprof.init(enable_function_stack=True)
# load checkpoint
meta = {'best_wer': 10**6, 'start_epoch': 0}
checkpointer = Checkpointer(args.output_dir, 'QuartzNet',
args.keep_milestones)
if args.resume:
args.ckpt = checkpointer.last_checkpoint() or args.ckpt
if args.ckpt is not None:
checkpointer.load(args.ckpt, model, ema_model, optimizer, scaler, meta)
start_epoch = meta['start_epoch']
best_wer = meta['best_wer']
epoch = 1
step = start_epoch * steps_per_epoch + 1
if args.pyprof:
torch.autograd.profiler.emit_nvtx().__enter__()
profiler.start()
# training loop
model.train()
if args.ema > 0.0:
mt_ema_params = init_multi_tensor_ema(model, ema_model)
# pre-allocate
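    # (Presumably this warms up the CUDA caching allocator: synthetic batches
    # covering the expected range of frame lengths are run forward and backward
    # once, so the largest activation buffers are allocated before real training.)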
if args.pre_allocate_range is not None:
n_feats = train_features_kw['n_filt']
pad_align = train_features_kw['pad_align']
a, b = args.pre_allocate_range
for n_frames in range(a, b + pad_align, pad_align):
print_once(f'Pre-allocation ({batch_size}x{n_feats}x{n_frames})...')
feat = torch.randn(batch_size, n_feats, n_frames, device='cuda')
feat_lens = torch.ones(batch_size, device='cuda').fill_(n_frames)
txt = torch.randint(high=len(symbols)-1, size=(batch_size, 100),
device='cuda')
txt_lens = torch.ones(batch_size, device='cuda').fill_(100)
with torch.cuda.amp.autocast(enabled=args.amp):
log_probs, enc_lens = model(feat, feat_lens)
del feat
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
loss.backward()
model.zero_grad()
torch.cuda.empty_cache()
bmark_stats = BenchmarkStats()
for epoch in range(start_epoch + 1, args.epochs + 1):
if multi_gpu and not use_dali:
train_loader.sampler.set_epoch(epoch)
epoch_utts = 0
epoch_loss = 0
accumulated_batches = 0
epoch_start_time = time.time()
epoch_eval_time = 0
for batch in train_loader:
if accumulated_batches == 0:
step_loss = 0
step_utts = 0
step_start_time = time.time()
if use_dali:
# with DALI, the data is already on GPU
feat, feat_lens, txt, txt_lens = batch
if train_feat_proc is not None:
feat, feat_lens = train_feat_proc(feat, feat_lens)
else:
batch = [t.cuda(non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feat, feat_lens = train_feat_proc(audio, audio_lens)
# Use context manager to prevent redundant accumulation of gradients
if (multi_gpu and accumulated_batches + 1 < args.grad_accumulation):
ctx = model.no_sync()
else:
ctx = empty_context()
with ctx:
with torch.cuda.amp.autocast(enabled=args.amp):
log_probs, enc_lens = model(feat, feat_lens)
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
loss /= args.grad_accumulation
if multi_gpu:
reduced_loss = reduce_tensor(loss.data, world_size)
else:
reduced_loss = loss
if torch.isnan(reduced_loss).any():
print_once(f'WARNING: loss is NaN; skipping update')
continue
else:
step_loss += reduced_loss.item()
step_utts += batch[0].size(0) * world_size
epoch_utts += batch[0].size(0) * world_size
accumulated_batches += 1
scaler.scale(loss).backward()
if accumulated_batches % args.grad_accumulation == 0:
epoch_loss += step_loss
scaler.step(optimizer)
scaler.update()
adjust_lr(step, epoch, optimizer)
optimizer.zero_grad()
if args.ema > 0.0:
apply_multi_tensor_ema(args.ema, *mt_ema_params)
if step % args.log_frequency == 0:
preds = greedy_decoder(log_probs)
wer, pred_utt, ref = greedy_wer(preds, txt, txt_lens, symbols)
if step % args.prediction_frequency == 0:
print_once(f' Decoded: {pred_utt[:90]}')
print_once(f' Reference: {ref[:90]}')
step_time = time.time() - step_start_time
log((epoch, step % steps_per_epoch or steps_per_epoch, steps_per_epoch),
step, 'train',
{'loss': step_loss,
'wer': 100.0 * wer,
'throughput': step_utts / step_time,
'took': step_time,
'lrate': optimizer.param_groups[0]['lr']})
step_start_time = time.time()
if step % args.eval_frequency == 0:
tik = time.time()
wer = evaluate(epoch, step, val_loader, val_feat_proc,
symbols, model, ema_model, ctc_loss,
greedy_decoder, args.amp, use_dali)
if wer < best_wer and epoch >= args.save_best_from:
checkpointer.save(model, ema_model, optimizer, scaler,
epoch, step, best_wer, is_best=True)
best_wer = wer
epoch_eval_time += time.time() - tik
step += 1
accumulated_batches = 0
# end of step
# The DALI iterator needs to be exhausted;
# if not using DALI, simulate drop_last=True with grad accumulation
if not use_dali and step > steps_per_epoch * epoch:
break
epoch_time = time.time() - epoch_start_time
epoch_loss /= steps_per_epoch
log((epoch,), None, 'train_avg', {'throughput': epoch_utts / epoch_time,
'took': epoch_time,
'loss': epoch_loss})
bmark_stats.update(epoch_utts, epoch_time, epoch_loss)
if epoch % args.save_frequency == 0 or epoch in args.keep_milestones:
checkpointer.save(model, ema_model, optimizer, scaler, epoch, step,
best_wer)
if 0 < args.epochs_this_job <= epoch - start_epoch:
print_once(f'Finished after {args.epochs_this_job} epochs.')
break
# end of epoch
if args.pyprof:
profiler.stop()
torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
log((), None, 'train_avg', bmark_stats.get(args.benchmark_epochs_num))
if epoch == args.epochs:
evaluate(epoch, step, val_loader, val_feat_proc, symbols, model,
ema_model, ctc_loss, greedy_decoder, args.amp, use_dali)
checkpointer.save(model, ema_model, optimizer, scaler, epoch, step,
best_wer)
flush_log()
if __name__ == "__main__":
main()

View file

@ -0,0 +1,81 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import glob
import multiprocessing
import json
import pandas as pd
from preprocessing_utils import parallel_preprocess
parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.')
parser.add_argument('--input_dir', type=str, required=True,
help='LibriSpeech collection input dir')
parser.add_argument('--dest_dir', type=str, required=True,
help='Output dir')
parser.add_argument('--output_json', type=str, default='./',
help='Name of the output JSON file.')
parser.add_argument('-s', '--speed', type=float, nargs='*',
help='Speed perturbation ratios')
parser.add_argument('--target_sr', type=int, default=None,
help='Target sample rate. '
'Defaults to the input sample rate.')
parser.add_argument('--overwrite', action='store_true',
help='Overwrite file if exists')
parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(),
help='Number of parallel worker processes for audio conversion')
args = parser.parse_args()
args.input_dir = args.input_dir.rstrip('/')
args.dest_dir = args.dest_dir.rstrip('/')
def build_input_arr(input_dir):
txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
recursive=True)
input_data = []
for txt_file in txt_files:
rel_path = os.path.relpath(txt_file, input_dir)
with open(txt_file) as fp:
for line in fp:
fname, _, transcript = line.partition(' ')
input_data.append(dict(input_relpath=os.path.dirname(rel_path),
input_fname=fname+'.flac',
transcript=transcript))
return input_data
print("[%s] Scaning input dir..." % args.output_json)
dataset = build_input_arr(input_dir=args.input_dir)
print("[%s] Converting audio files..." % args.output_json)
dataset = parallel_preprocess(dataset=dataset,
input_dir=args.input_dir,
dest_dir=args.dest_dir,
target_sr=args.target_sr,
speed=args.speed,
overwrite=args.overwrite,
parallel=args.parallel)
print("[%s] Generating json..." % args.output_json)
df = pd.DataFrame(dataset, dtype=object)
# Save JSON with the json module; df.to_json() produces backslashes in file paths
dataset = df.to_dict(orient='records')
with open(args.output_json, 'w') as fp:
json.dump(dataset, fp, indent=2)

View file

@ -0,0 +1,72 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import pandas as pd
from download_utils import download_file, md5_checksum, extract
parser = argparse.ArgumentParser(description='Download, verify and extract dataset files')
parser.add_argument('csv', type=str,
help='CSV file with urls and checksums to download.')
parser.add_argument('dest', type=str,
help='Download destination folder.')
parser.add_argument('-e', type=str, default=None,
help='Extraction destination folder. Defaults to the download folder if not provided.')
parser.add_argument('--skip_download', action='store_true',
help='Skip downloading the files')
parser.add_argument('--skip_checksum', action='store_true',
help='Skip checksum')
parser.add_argument('--skip_extract', action='store_true',
help='Skip extracting files')
args = parser.parse_args()
args.e = args.e or args.dest
df = pd.read_csv(args.csv, delimiter=',')
if not args.skip_download:
for url in df.url:
fname = url.split('/')[-1]
print("Downloading %s:" % fname)
download_file(url=url, dest_folder=args.dest, fname=fname)
else:
print("Skipping file download")
if not args.skip_checksum:
for index, row in df.iterrows():
url = row['url']
md5 = row['md5']
fname = url.split('/')[-1]
fpath = os.path.join(args.dest, fname)
print("Verifing %s: " % fname, end='')
ret = md5_checksum(fpath=fpath, target_hash=md5)
print("Passed" if ret else "Failed")
else:
print("Skipping checksum")
if not args.skip_extract:
for url in df.url:
fname = url.split('/')[-1]
fpath = os.path.join(args.dest, fname)
print("Decompressing %s:" % fpath)
extract(fpath=fpath, dest_folder=args.e)
else:
print("Skipping file extraction")

View file

@ -0,0 +1,71 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import requests
import os
import tarfile
import tqdm
def download_file(url, dest_folder, fname, overwrite=False):
fpath = os.path.join(dest_folder, fname)
if os.path.isfile(fpath):
if overwrite:
print("Overwriting existing file")
else:
print("File exists, skipping download.")
return
tmp_fpath = fpath + '.tmp'
if not os.path.exists(os.path.dirname(tmp_fpath)):
os.makedirs(os.path.dirname(tmp_fpath))
r = requests.get(url, stream=True)
file_size = int(r.headers['Content-Length'])
chunk_size = 1024 * 1024 # 1MB
total_chunks = int(file_size / chunk_size)
with open(tmp_fpath, 'wb') as fp:
content_iterator = r.iter_content(chunk_size=chunk_size)
chunks = tqdm.tqdm(content_iterator, total=total_chunks,
unit='MB', desc=fpath, leave=True)
for chunk in chunks:
fp.write(chunk)
os.rename(tmp_fpath, fpath)
def md5_checksum(fpath, target_hash):
file_hash = hashlib.md5()
with open(fpath, "rb") as fp:
for chunk in iter(lambda: fp.read(1024*1024), b""):
file_hash.update(chunk)
return file_hash.hexdigest() == target_hash
def extract(fpath, dest_folder):
if fpath.endswith('.tar.gz'):
mode = 'r:gz'
elif fpath.endswith('.tar'):
mode = 'r:'
else:
raise IOError('fpath has unknown extension: %s' % fpath)
with tarfile.open(fpath, mode) as tar:
members = tar.getmembers()
for member in tqdm.tqdm(iterable=members, total=len(members), leave=True):
tar.extract(path=dest_folder, member=member)
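# Minimal self-check sketch (illustrative only, not used by the download
# pipeline): verifies md5_checksum against a digest computed directly with
# hashlib on a temporary file.
if __name__ == "__main__":
    import tempfile
    payload = b"quartznet md5 self-check"
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name
    try:
        assert md5_checksum(tmp_path, hashlib.md5(payload).hexdigest())
        print("md5_checksum self-check passed")
    finally:
        os.remove(tmp_path)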

View file

@ -0,0 +1,5 @@
url,md5
http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135

View file

@ -0,0 +1,8 @@
url,md5
http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135
http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522
http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa
http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708

View file

@ -0,0 +1,76 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import multiprocessing
import librosa
import functools
import sox
from tqdm import tqdm
def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None,
overwrite=True):
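    # `data` is a single record as produced by build_input_arr (see
    # convert_librispeech.py): a dict with 'input_relpath', 'input_fname'
    # (a .flac file) and 'transcript'. The returned dict holds the lower-cased
    # transcript and one file entry per generated wav (original speed plus each
    # speed-perturbed copy).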
speed = speed or []
speed.append(1)
speed = list(set(speed))  # Make unique
input_fname = os.path.join(input_dir,
data['input_relpath'],
data['input_fname'])
input_sr = sox.file_info.sample_rate(input_fname)
target_sr = target_sr or input_sr
os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True)
output_dict = {}
output_dict['transcript'] = data['transcript'].lower().strip()
output_dict['files'] = []
fname = os.path.splitext(data['input_fname'])[0]
for s in speed:
output_fname = fname + '{}.wav'.format('' if s==1 else '-{}'.format(s))
output_fpath = os.path.join(dest_dir,
data['input_relpath'],
output_fname)
if not os.path.exists(output_fpath) or overwrite:
cbn = sox.Transformer().speed(factor=s).convert(target_sr)
cbn.build(input_fname, output_fpath)
file_info = sox.file_info.info(output_fpath)
file_info['fname'] = os.path.join(os.path.basename(dest_dir),
data['input_relpath'],
output_fname)
file_info['speed'] = s
output_dict['files'].append(file_info)
if s == 1:
file_info = sox.file_info.info(output_fpath)
output_dict['original_duration'] = file_info['duration']
output_dict['original_num_samples'] = file_info['num_samples']
return output_dict
def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel):
with multiprocessing.Pool(parallel) as p:
func = functools.partial(preprocess,
input_dir=input_dir, dest_dir=dest_dir,
target_sr=target_sr, speed=speed, overwrite=overwrite)
dataset = list(tqdm(p.imap(func, dataset), total=len(dataset)))
return dataset