Merge pull request #7 from NVIDIA/master

Pull from remote
Swetha Mandava 2020-04-24 13:50:28 -07:00 committed by GitHub
commit eb8e823c39
66 changed files with 8945 additions and 63 deletions

View file

@@ -117,11 +117,11 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
return model
def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
def bias_gelu(bias, y):
x = bias + y
return torch.nn.functional.gelu(x)# x * 0.5 * (1.0 + torch.erf(x / 1.41421))
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
def bias_tanh(bias, y):
x = bias + y
@@ -130,7 +130,7 @@ def bias_tanh(bias, y):
def swish(x):
return x * torch.sigmoid(x)
ACT2FN = {"gelu": torch.nn.functional.gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish}
ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish}
class LinearActivation(Module):
r"""Fused Linear and activation Module.

View file

@@ -472,6 +472,11 @@ def get_answers(examples, features, results, args):
preds,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)[:args.n_best_size]
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
total_scores = []
best_non_null_entry = None

View file

@@ -0,0 +1,34 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
FROM ${FROM_IMAGE_NAME}
RUN apt update && \
apt install -y openjdk-8-jdk && \
curl http://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz -o /opt/spark-2.4.5-bin-hadoop2.7.tgz && \
tar zxf /opt/spark-2.4.5-bin-hadoop2.7.tgz -C /opt/ && \
rm /opt/spark-2.4.5-bin-hadoop2.7.tgz
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip uninstall -y apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
WORKDIR /workspace/dlrm
COPY . .

View file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@@ -0,0 +1,3 @@
DLRM for PyTorch
This repository includes software from https://github.com/facebookresearch/dlrm licensed under the MIT License

View file

@@ -0,0 +1,516 @@
# DLRM For PyTorch
This repository provides a script and recipe to train the Deep Learning Recommendation Model (DLRM) to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA.
## Table Of Contents
* [Table Of Contents](#table-of-contents)
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Preprocess with Spark](#preprocess-with-spark)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 32G)](#training-accuracy-nvidia-dgx-1-8x-v100-32g)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The Deep Learning Recommendation Model (DLRM) is a recommendation model designed to
make use of both categorical and numerical inputs. It was first described in
[Deep Learning Recommendation Model for Personalization and Recommendation Systems](https://arxiv.org/abs/1906.00091).
This repository provides a reimplementation of the codebase provided originally [here](https://github.com/facebookresearch/dlrm).
The scripts provided enable you to train DLRM on the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).
This model uses a slightly different preprocessing procedure than the one found in the original implementation. You can find a detailed description of the preprocessing steps in the [Dataset guidelines](#dataset-guidelines) section.
Using DLRM you can train a high-quality general model for providing recommendations.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 1.77x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. It is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
DLRM accepts two types of features: categorical and numerical. For each categorical
feature, an embedding table is used to provide a dense representation of each unique value. The dense (numerical) features enter the model and are transformed by a
simple neural network referred to as the "bottom MLP". This part of the network consists of a series
of linear layers with ReLU activations. The output of the bottom MLP and the embedding vectors
are then fed into the "dot interaction" operation. The output of "dot interaction" is concatenated with the features from the bottom MLP and fed into the "top MLP", which is also a series of dense layers with activations. A minimal sketch of the dot interaction is shown after the figure below.
The model outputs a single number which can be interpreted as the likelihood of a certain user clicking an ad.
<p align="center">
<img width="100%" src="./notebooks/DLRM_architecture.png" />
<br>
Figure 1. The architecture of DLRM.
</p>
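The following sketch illustrates the dot interaction described above. It assumes the bottom MLP output and all embedding vectors share the same dimensionality, omits the extra padding column that `dlrm/model.py` adds to keep the top MLP input a multiple of 8, and uses illustrative names throughout.
```python
import torch

def dot_interaction(bottom_mlp_output, embedding_outputs):
    # bottom_mlp_output: [batch, d]; each embedding output: [batch, d]
    batch, d = bottom_mlp_output.shape
    num_inputs = len(embedding_outputs) + 1
    # Stack the bottom-MLP output together with all embedding vectors.
    vectors = torch.cat([bottom_mlp_output] + embedding_outputs, dim=1).view(batch, num_inputs, d)
    # Pairwise dot products between all input vectors.
    products = torch.bmm(vectors, vectors.transpose(1, 2))
    # Keep only the strictly lower triangle: each pair once, no self-interaction.
    li, lj = torch.tril_indices(num_inputs, num_inputs, offset=-1)
    pairwise = products[:, li, lj]
    # Concatenate the dense (bottom-MLP) features with the pairwise interactions.
    return torch.cat([bottom_mlp_output, pairwise], dim=1)

# Example: 3 categorical features with 8-dimensional embeddings, batch of 4.
bottom = torch.randn(4, 8)
embeddings = [torch.randn(4, 8) for _ in range(3)]
print(dot_interaction(bottom, embeddings).shape)  # torch.Size([4, 14]) = 8 dense + 6 pairwise terms
```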
### Default configuration
The following features were implemented in this model:
- general
- static loss scaling for Tensor Cores (mixed precision) training
- preprocessing
- dataset preprocessing using Spark
### Feature support matrix
The following features are supported by this model:
| Feature | DLRM |
|---------|------|
| Automatic mixed precision (AMP) | yes |
#### Features
Automatic Mixed Precision (AMP) - enables mixed precision training without any changes to the code base by performing automatic graph rewrites and loss scaling controlled by an environment variable.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision training is enabled by default. To turn it off, pass the `--nofp16` flag to the `main.py` script.
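For example, assuming the dataset path used in the Quick Start Guide:
```bash
# Mixed precision training (default):
python -m dlrm.scripts.main --mode train --dataset /data/dlrm/binary_dataset/

# Pure FP32 training:
python -m dlrm.scripts.main --mode train --nofp16 --dataset /data/dlrm/binary_dataset/
```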
## Setup
The following section lists the requirements for training DLRM.
### Requirements
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- PyTorch 20.03-py3 (or later) NGC container
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
If you cannot use the PyTorch NGC container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using
the default parameters of DLRM on the Criteo Terabyte dataset. For the specifics concerning training and inference,
see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Recommendation/DLRM
```
2. Build a DLRM Docker container.
```bash
docker build . -t nvidia_dlrm_pyt
```
3. Start an interactive session in the NGC container to run preprocessing/training and inference.
The DLRM PyTorch container can be launched with:
```bash
mkdir -p data
docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_dlrm_pyt bash
```
4. Download and preprocess the dataset.
You can download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/.
When you have successfully downloaded it, put it in the `/data/dlrm/criteo/` directory in the container (`$PWD/data/dlrm/criteo` in the host system).
You can then run the preprocessing with the commands below. Note
that this will require about 4TB of disk storage.
```
cd preproc
./prepare_dataset.sh
cd -
```
5. Start training.
```
python -m dlrm.scripts.main --mode train --dataset /data/dlrm/binary_dataset/
```
6. Start validation/evaluation.
```
python -m dlrm.scripts.main --mode test --dataset /data/dlrm/binary_dataset/
```
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
The `dlrm/scripts/main.py` script provides an entry point to most of the functionality. Different command-line flags let you run training or validation, and benchmark both training and inference on real or synthetic data.
The `dlrm/model.py` file provides the definition of the DLRM neural network.
Utilities connected to loading the data reside in the `data` directory.
### Parameters
### Command-line options
The `dlrm/scripts/main.py` script supports a number of command-line flags. You can get the descriptions of those by running `python -m dlrm.scripts.main --help`. Running this command will output:
```
USAGE: /workspace/dlrm/dlrm/scripts/main.py [flags]
flags:
/workspace/dlrm/dlrm/scripts/main.py:
--auc_threshold: Stop the training after achieving this AUC
(a number)
--base_device: Device to run the majority of the model operations
(default: 'cuda')
--batch_size: Batch size used for training
(default: '32768')
(an integer)
--benchmark_warmup_steps: Number of initial iterations to exclude from
throughput measurements
(default: '0')
(an integer)
--bottom_mlp_sizes: Linear layer sizes for the bottom MLP
(default: '512,256,128')
(a comma separated list)
--dataset: Full path to binary dataset. Must include files such as:
train_data.bin, test_data.bin
--dataset_subset: Use only a subset of the training data. If None (default)
will use all of it. Must be either None, or a float in range [0,1]
(a number)
--decay_start_step: Optimization step after which to start decaying the
learning rate, if None will start decaying right after the warmup phase is
completed
(default: '64000')
(an integer)
--decay_steps: Polynomial learning rate decay steps. If equal to 0 will not do
any decaying
(default: '80000')
(an integer)
--embedding_dim: Dimensionality of embedding space for categorical features
(default: '128')
(an integer)
--epochs: Number of epochs to train for
(default: '1')
(an integer)
--[no]fp16: If True (default) the script will use Automatic Mixed Precision
(default: 'true')
--[no]hash_indices: If True the model will compute `index := index % table
size` to ensure that the indices match table sizes
(default: 'false')
--inference_benchmark_batch_sizes: Batch sizes for inference throughput and
latency measurements
(default: '1,64,4096')
(a comma separated list)
--inference_benchmark_steps: Number of steps for measuring inference latency
and throughput
(default: '200')
(an integer)
--interaction_op: Type of interaction operation to perform. Supported choices:
'dot' or 'cat'
(default: 'dot')
--load_checkpoint_path: Path from which to load a checkpoint
--log_path: Destination for the log file with various results and statistics
(default: './log.json')
--loss_scale: Static loss scale for Mixed Precision Training
(default: '8192.0')
(a number)
--lr: Base learning rate
(default: '28.0')
(a number)
--max_steps: Stop training after doing this many optimization steps
(an integer)
--max_table_size: Maximum number of rows per embedding table, by default equal
to the number of unique values for each categorical variable
(an integer)
--mode: <train|test|inference_benchmark>: Select task to be performed
(default: 'train')
--num_numerical_features: Number of numerical features in the dataset.
Defaults to 13 for the Criteo Terabyte Dataset
(default: '13')
(an integer)
--output_dir: Path where to save the checkpoints
(default: '/tmp')
--print_freq: Number of optimizations steps between printing training status
to stdout
(default: '200')
(an integer)
--save_checkpoint_path: Path to which to save the training checkpoints
--seed: Random seed
(default: '12345')
(an integer)
--[no]self_interaction: Set to True to use self-interaction
(default: 'false')
-shuffle,--[no]shuffle_batch_order: Read batch in train dataset by random
order
(default: 'false')
--[no]synthetic_dataset: Use synthetic instead of real data for benchmarking
purposes
(default: 'false')
--synthetic_dataset_table_sizes: Embedding table sizes to use with the
synthetic dataset
(a comma separated list)
--test_after: Don't test the model unless this many epochs has been completed
(default: '0.0')
(a number)
--test_batch_size: Batch size used for testing/validation
(default: '32768')
(an integer)
--test_freq: Number of optimization steps between validations. If None will
test after each epoch
(an integer)
--top_mlp_sizes: Linear layer sizes for the top MLP
(default: '1024,1024,512,256,1')
(a comma separated list)
--warmup_factor: Learning rate warmup factor. Must be a non-negative integer
(default: '0')
(an integer)
--warmup_steps: Number of warmup optimization steps
(default: '6400')
(an integer)
```
The following example output is printed when running the model:
```
Epoch:[0/1] [200/128028] eta: 1:28:44 loss: 0.1782 step_time: 0.041657 lr: 0.8794
Epoch:[0/1] [400/128028] eta: 1:25:15 loss: 0.1403 step_time: 0.038504 lr: 1.7544
Epoch:[0/1] [600/128028] eta: 1:23:56 loss: 0.1384 step_time: 0.038422 lr: 2.6294
Epoch:[0/1] [800/128028] eta: 1:23:13 loss: 0.1370 step_time: 0.038421 lr: 3.5044
Epoch:[0/1] [1000/128028] eta: 1:22:45 loss: 0.1362 step_time: 0.038464 lr: 4.3794
Epoch:[0/1] [1200/128028] eta: 1:22:24 loss: 0.1346 step_time: 0.038455 lr: 5.2544
Epoch:[0/1] [1400/128028] eta: 1:22:07 loss: 0.1339 step_time: 0.038459 lr: 6.1294
Epoch:[0/1] [1600/128028] eta: 1:21:52 loss: 0.1320 step_time: 0.038481 lr: 7.0044
Epoch:[0/1] [1800/128028] eta: 1:21:39 loss: 0.1315 step_time: 0.038482 lr: 7.8794
Epoch:[0/1] [2000/128028] eta: 1:21:27 loss: 0.1304 step_time: 0.038466 lr: 8.7544
Epoch:[0/1] [2200/128028] eta: 1:21:15 loss: 0.1305 step_time: 0.038430 lr: 9.6294
```
### Getting the data
This example uses the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).
The first 23 days are used as the training set. The last day is split in half. The first part is used as a validation set and the second one as a hold-out test set.
#### Dataset guidelines
The preprocessing steps applied to the raw data include the following (a short sketch of the numerical transforms follows this list):
- Replacing the missing values with `0`
- Replacing the categorical values that exist fewer than 15 times with a special value
- Converting the hash values to consecutive integers
- Adding 2 to all the numerical features so that all of them are greater than or equal to 1
- Taking a natural logarithm of all numerical features
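As a rough illustration of the numerical-feature steps above (replacing missing values, shifting by 2, and taking the logarithm; the actual preprocessing runs in Spark, see below, and the values here are made up):
```python
import numpy as np

# Made-up raw numerical column with a missing value (np.nan).
raw = np.array([np.nan, -1.0, 0.0, 3.0, 100.0])

x = np.nan_to_num(raw, nan=0.0)  # replace missing values with 0
x = x + 2.0                      # shift so every value is >= 1
x = np.log(x)                    # natural logarithm
print(x)
```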
#### Multi-dataset
Our preprocessing scripts are designed for the Criteo Terabyte Dataset and should work with any other dataset with the same format. The data should be split into text files, where each line contains a single training example. An example consists of multiple fields separated by tab characters (see the sketch after this list):
- The first field is the label: `1` for a positive example and `0` for a negative one.
- The next `N` tokens should contain the numerical features separated by tabs.
- The next `M` tokens should contain the hashed categorical features separated by tabs.
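As a minimal illustration of this format (the field values below are made up), a line with `N=3` numerical and `M=2` categorical features could be parsed as follows:
```python
# Made-up example line: label, 3 numerical features, 2 hashed categorical features.
line = "1\t5\t0\t23\t68fd1e64\t80e26c9b\n"

fields = line.rstrip("\n").split("\t")
label = int(fields[0])                     # 1 = positive, 0 = negative
numerical = [int(v) for v in fields[1:4]]  # the next N numerical features
categorical = fields[4:]                   # the next M hashed categorical features
print(label, numerical, categorical)       # 1 [5, 0, 23] ['68fd1e64', '80e26c9b']
```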
#### Preprocess with Spark
The script `spark_data_utils.py` is a PySpark application used to preprocess the Criteo Terabyte Dataset. Spark 2.4.5 is installed in the Docker image and runs as a standalone cluster. The `run-spark.sh` script starts Spark and then runs several PySpark jobs with `spark_data_utils.py` to:
- Generate the dictionary
- Transform the train dataset
- Transform the test dataset
- Transform the validation dataset
Change the variables in the `run-spark.sh` script according to your environment.
Configure the paths.
```
export SPARK_LOCAL_DIRS=/data/spark-tmp
export INPUT_PATH=/data/criteo
export OUTPUT_PATH=/data/output
```
Note that the Spark job requires about 3TB of disk space for data shuffling.
`SPARK_LOCAL_DIRS` is the path where Spark writes its shuffle data.
`INPUT_PATH` is the path to the Criteo Terabyte Dataset, including the uncompressed files such as day_0, day_1, and so on.
`OUTPUT_PATH` is where the script writes the output data. It will generate the following subdirectories:
- `model`: the dictionary folder
- `train`: the training dataset, transformed from day_0 to day_22
- `test`: the test dataset, transformed from the first half of day_23
- `validation`: the validation dataset, transformed from the second half of day_23
Configure the resources which Spark will use.
```
export TOTAL_CORES=80
export TOTAL_MEMORY=800
```
`TOTAL_CORES` is the total number of CPU cores you want Spark to use.
`TOTAL_MEMORY` is the total amount of memory Spark will use.
Configure frequency limit.
```
USE_FREQUENCY_LIMIT=15
```
The frequency limit is used to filter out categorical values that appear fewer than n times in the whole dataset and map them to 0. Change this variable to 1 to enable it. The default frequency limit in the script is 15. You can also change the limit by editing the line `OPTS="--frequency_limit 8"`.
After the above configuration, you can run `run-spark.sh` directly if you have already downloaded the dataset, or run `prepare_dataset.sh`, which verifies the downloaded dataset and then runs the preprocessing job.
### Training process
The main training script resides in `dlrm/scripts/main.py`. Once the training is completed, it stores the checkpoint
in the path specified by `--save_checkpoint_path` and a training log in `--log_path`. The quality of the predictions
generated by the model is measured by the [ROC AUC metric](https://scikit-learn.org/stable/modules/model_evaluation.html#roc-metrics).
The speed of training and inference is measured by throughput, i.e., the number
of samples processed per second. We use mixed precision training with static loss scaling for the bottom and top MLPs, while the embedding tables are stored in FP32.
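The training script imports `sklearn.metrics.roc_auc_score` for this metric; as a minimal reminder of what it computes over labels and predicted click probabilities:
```python
from sklearn.metrics import roc_auc_score

# Toy ground-truth click labels and predicted click probabilities.
labels = [0, 0, 1, 1, 0, 1]
scores = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]

print(roc_auc_score(labels, scores))  # ~0.89
```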
### Inference process
This section describes inference with PyTorch in Python. If you're interested in inference using the Triton Inference Server, refer to the `triton/README.md` file.
Two modes of inference are currently supported by the `dlrm/scripts/main.py` script:
1. Inference benchmark: measures and prints throughput and latency numbers for multiple batch sizes. You can activate it by setting the batch sizes to be tested with the `--inference_benchmark_batch_sizes` command-line argument. It will use the default test dataset unless the `--synthetic_dataset` flag is passed.
2. Test-only: runs a full validation on a checkpoint to measure ROC AUC. You can enable it by passing the `--mode test` flag.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
```
python -m dlrm.scripts.main --mode train --max_steps 500 --benchmark_warmup_steps 250 --dataset /data
```
You can also pass the `--synthetic_dataset` flag if you haven't yet downloaded the dataset.
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
```
python -m dlrm.scripts.main --mode inference_benchmark --dataset /data
```
You can also pass the `--synthetic_dataset` flag if you haven't yet downloaded the dataset.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running the `dlrm/scripts/main.py` script for one epoch, as described in the Quick Start Guide, in the DLRM Docker container on a single Tesla V100 32G GPU.
| GPUs | Batch size / GPU | Accuracy (AUC) - FP32 | Accuracy (AUC) - mixed precision | Time to train - FP32 [hours] | Time to train - mixed precision [hours] | Time to train speedup (FP32 to mixed precision)
|----|----|----|----|---|---|---|
| 1 | 32k | 0.80362 | 0.80362 | 2.46 | 1.44 | 1.71 |
##### Training stability test
The table below shows the complete convergence data for 16 different random seeds.
| Random seed | Mixed precision AUC | Single precision AUC |
|-------:|---------:|---------:|
| 8 | 0.803696 | 0.803669 |
| 9 | 0.803617 | 0.803574 |
| 10 | 0.803672 | 0.80367 |
| 11 | 0.803699 | 0.803683 |
| 12 | 0.803659 | 0.803724 |
| 13 | 0.803578 | 0.803565 |
| 14 | 0.803609 | 0.803613 |
| 15 | 0.803585 | 0.803615 |
| 16 | 0.803553 | 0.803583 |
| 17 | 0.803644 | 0.803688 |
| 18 | 0.803656 | 0.803609 |
| 19 | 0.803589 | 0.803635 |
| 20 | 0.803567 | 0.803611 |
| 21 | 0.803548 | 0.803487 |
| 22 | 0.803532 | 0.803591 |
| 23 | 0.803625 | 0.803601 |
| **mean** | **0.803614** | **0.803620** |
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running:
```
python -m dlrm.scripts.main --mode train --max_steps 200 --benchmark_warmup_steps 50 --fp16 --dataset /data
```
in the DLRM Docker container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in items per second) were averaged over 150 training steps.
| GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) |
|----|---|---|---|---|
| 1 | 32k | 494k | 875k | 1.773 |
We used throughput in items processed per second as the performance metric.
## Release notes
### Changelog
April 2020
- Initial release
### Known issues
There are no known issues with this model.

View file

@@ -0,0 +1,98 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import time
import numpy as np
import argparse
import torch
from torch.utils.data import Dataset
class CriteoBinDataset(Dataset):
"""Simple dataloader for a recommender system. Designed to work with a single binary file."""
def __init__(self, data_file, batch_size=1, subset=None,
numerical_features=13, categorical_features=26,
data_type='int32', online_shuffle=True):
self.data_type = np.__dict__[data_type]
bytes_per_feature = self.data_type().nbytes
self.tad_fea = 1 + numerical_features
self.tot_fea = 1 + numerical_features + categorical_features
self.batch_size = batch_size
self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
if subset is not None:
if subset <= 0 or subset > 1:
raise ValueError('Subset parameter must be in the (0, 1] range')
self.num_entries = int(self.num_entries * subset)
print('data file:', data_file, 'number of batches:', self.num_entries)
self.file = open(data_file, 'rb')
self.online_shuffle=online_shuffle
def __len__(self):
return self.num_entries
def __getitem__(self, idx):
if idx == 0:
self.file.seek(0, 0)
if self.online_shuffle:
self.file.seek(idx * self.bytes_per_entry, 0)
raw_data = self.file.read(self.bytes_per_entry)
array = np.frombuffer(raw_data, dtype=self.data_type).reshape(-1, self.tot_fea)
# numerical features are encoded as float32
numerical_features = array[:, 1:self.tad_fea].view(dtype=np.float32)
numerical_features = torch.from_numpy(numerical_features)
categorical_features = torch.from_numpy(array[:, self.tad_fea:])
labels = torch.from_numpy(array[:, 0])
return numerical_features, categorical_features, labels
def __del__(self):
self.file.close()
if __name__ == '__main__':
print('Dataloader benchmark')
parser = argparse.ArgumentParser()
parser.add_argument('--file', type=str)
parser.add_argument('--batch_size', type=int)
parser.add_argument('--steps', type=int, default=1000)
args = parser.parse_args()
dataset = CriteoBinDataset(data_file=args.file, batch_size=args.batch_size)
begin = time.time()
for i in range(args.steps):
_ = dataset[i]
end = time.time()
step_time = (end - begin) / args.steps
throughput = args.batch_size / step_time
print(f'Mean step time: {step_time:.6f} [s]')
print(f'Mean throughput: {throughput:,.0f} [samples / s]')
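A minimal sketch of exercising this loader with a tiny, made-up binary file in the expected layout (one 4-byte slot per field: label, 13 numerical values stored as float32 bits, 26 categorical indices), assuming the class is importable as in `dlrm/scripts/main.py`:
```python
import numpy as np
from dlrm.data.data_loader import CriteoBinDataset  # import path per main.py

# Build a tiny fake dataset: 256 samples, 13 numerical + 26 categorical features.
num_samples, num_num, num_cat = 256, 13, 26
labels = np.random.randint(0, 2, size=(num_samples, 1)).astype(np.int32)
# Numerical values are float32 bit-cast into the same 4-byte int32 slots.
numerical = np.random.rand(num_samples, num_num).astype(np.float32).view(np.int32)
categorical = np.random.randint(0, 100, size=(num_samples, num_cat)).astype(np.int32)
np.concatenate([labels, numerical, categorical], axis=1).tofile("toy_data.bin")

dataset = CriteoBinDataset(data_file="toy_data.bin", batch_size=64)
numerical_batch, categorical_batch, label_batch = dataset[0]
print(numerical_batch.shape, categorical_batch.shape, label_batch.shape)
# Expected: torch.Size([64, 13]) torch.Size([64, 26]) torch.Size([64])
```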

View file

@@ -0,0 +1,42 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import math
from torch.utils.data import Dataset
class SyntheticDataset(Dataset):
"""Synthetic dataset version of criteo dataset."""
def __init__(self, num_entries, device='cuda', batch_size=1, dense_features=13,
categorical_feature_sizes=None):
# dataset. single target, 13 dense features, 26 sparse features
self.sparse_features = len(categorical_feature_sizes)
self.dense_features = dense_features
self.tot_fea = 1 + dense_features + self.sparse_features
self.batch_size = batch_size
self.batches_per_epoch = math.ceil(num_entries / batch_size)
self.categorical_feature_sizes = categorical_feature_sizes
self.device = device
self.tensor = torch.randint(low=0, high=2, size=(self.batch_size, self.tot_fea), device=self.device)
self.tensor = self.tensor.float()
def __len__(self):
return self.batches_per_epoch
def __getitem__(self, idx):
return self.tensor[:, 1:14], self.tensor[:, 14:], self.tensor[:, 0]

View file

@@ -0,0 +1,224 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import math
from absl import logging
import torch
from torch import nn
from typing import List
class Dlrm(nn.Module):
"""Reimplement Facebook's DLRM model
Original implementation is from https://github.com/facebookresearch/dlrm.
"""
def __init__(self, num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes,
embedding_dim=32, interaction_op="dot", self_interaction=False, hash_indices=False,
base_device="cuda", sigmoid=False):
# Running everything on gpu by default
self._base_device = base_device
self._embedding_device_map = [base_device for _ in range(len(categorical_feature_sizes))]
super(Dlrm, self).__init__()
if embedding_dim != bottom_mlp_sizes[-1]:
raise TypeError("The last bottom MLP layer must have same size as embedding.")
self._embedding_dim = embedding_dim
self._interaction_op = interaction_op
self._self_interaction = self_interaction
self._hash_indices = hash_indices
self._categorical_feature_sizes = copy.copy(categorical_feature_sizes)
# Interactions are computed among the outputs of all the embedding tables and the bottom MLP, i.e. among
# (num_embedding_tables + 1) vectors of size embedding_dim. The ``dot`` interaction computes the dot product
# between every pair of vectors. The ``cat`` interaction concatenates all the vectors together.
# The output of the interaction will have shape [num_interactions, embedding_dim].
self._num_interaction_inputs = len(categorical_feature_sizes) + 1
if interaction_op == "dot":
if self_interaction:
raise NotImplementedError
num_interactions = (self._num_interaction_inputs * (self._num_interaction_inputs - 1)) // 2 + embedding_dim
elif interaction_op == "cat":
num_interactions = self._num_interaction_inputs * embedding_dim
else:
raise TypeError(F"Unknown interaction {interaction_op}.")
self.embeddings = nn.ModuleList()
self._create_embeddings(self.embeddings, embedding_dim, categorical_feature_sizes)
# Create bottom MLP
bottom_mlp_layers = []
input_dims = num_numerical_features
for output_dims in bottom_mlp_sizes:
bottom_mlp_layers.append(
nn.Linear(input_dims, output_dims))
bottom_mlp_layers.append(nn.ReLU(inplace=True))
input_dims = output_dims
self.bottom_mlp = nn.Sequential(*bottom_mlp_layers)
# Create Top MLP
top_mlp_layers = []
input_dims = num_interactions
if self._interaction_op == 'dot':
input_dims += 1 # pad 1 to be multiple of 8
for output_dims in top_mlp_sizes[:-1]:
top_mlp_layers.append(nn.Linear(input_dims, output_dims))
top_mlp_layers.append(nn.ReLU(inplace=True))
input_dims = output_dims
# last Linear layer uses sigmoid
top_mlp_layers.append(nn.Linear(input_dims, top_mlp_sizes[-1]))
if sigmoid:
top_mlp_layers.append(nn.Sigmoid())
self.top_mlp = nn.Sequential(*top_mlp_layers)
self._initialize_mlp_weights()
self._interaction_padding = torch.zeros(1, 1, dtype=torch.float32)
self.tril_indices = torch.tensor([[i for i in range(len(self.embeddings) + 1)
for j in range(i + int(self_interaction))],
[j for i in range(len(self.embeddings) + 1)
for j in range(i + int(self_interaction))]])
def _interaction(self,
bottom_mlp_output: torch.Tensor,
embedding_outputs: List[torch.Tensor],
batch_size: int) -> torch.Tensor:
"""Interaction
"dot" interaction is a bit tricky to implement and test. Break it out from forward so that it can be tested
independently.
Args:
bottom_mlp_output (Tensor):
embedding_outputs (list): Sequence of tensors
batch_size (int):
"""
if self._interaction_padding is None:
self._interaction_padding = torch.zeros(
batch_size, 1, dtype=bottom_mlp_output.dtype, device=bottom_mlp_output.device)
concat = torch.cat([bottom_mlp_output] + embedding_outputs, dim=1)
if self._interaction_op == "dot" and not self._self_interaction:
concat = concat.view((-1, self._num_interaction_inputs, self._embedding_dim))
interaction = torch.bmm(concat, torch.transpose(concat, 1, 2))
interaction_flat = interaction[:, self.tril_indices[0], self.tril_indices[1]]
# concatenate dense features and interactions
interaction_padding = self._interaction_padding.expand(batch_size, 1).to(dtype=bottom_mlp_output.dtype)
interaction_output = torch.cat(
(bottom_mlp_output, interaction_flat, interaction_padding), dim=1)
elif self._interaction_op == "cat":
interaction_output = concat
else:
raise NotImplementedError
return interaction_output
def _initialize_mlp_weights(self):
"""Initializing weights same as original DLRM"""
for module in self.modules():
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight.data, 0., math.sqrt(2. / (module.in_features + module.out_features)))
nn.init.normal_(module.bias.data, 0., math.sqrt(1. / module.out_features))
# Explicitly zero the weights corresponding to the zero-padded interaction output. They will
# stay 0 throughout training. An assert can be added at the end of training
# to verify that the padding does not add model capacity; it remains zero padding.
nn.init.zeros_(self.top_mlp[0].weight[:, -1].data)
@property
def num_categorical_features(self):
return len(self._categorical_feature_sizes)
def extra_repr(self):
s = (F"interaction_op={self._interaction_op}, self_interaction={self._self_interaction}, "
F"hash_indices={self._hash_indices}")
return s
# pylint:enable=missing-docstring
@classmethod
def from_dict(cls, obj_dict, **kwargs):
"""Create from json str"""
return cls(**obj_dict, **kwargs)
def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes):
# Each embedding table has size [num_features, embedding_dim]
for i, num_features in enumerate(categorical_feature_sizes):
# Allocate directly on GPU is much faster than allocating on CPU then copying over
embedding_weight = torch.empty((num_features, embedding_dim), device=self._embedding_device_map[i])
embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False, sparse=True)
# Initializing embedding same as original DLRM
nn.init.uniform_(
embedding.weight.data,
-math.sqrt(1. / embedding.num_embeddings),
math.sqrt(1. / embedding.num_embeddings))
embeddings.append(embedding)
def set_devices(self, base_device):
"""Set devices to run the model
Args:
base_device (string);
"""
self._base_device = base_device
self.bottom_mlp.to(base_device)
self.top_mlp.to(base_device)
self._interaction_padding = self._interaction_padding.to(base_device)
self._embedding_device_map = [base_device for _ in range(self.num_categorical_features)]
for embedding_id, device in enumerate(self._embedding_device_map):
logging.info("Place embedding %d on device %s", embedding_id, device)
self.embeddings[embedding_id].to(device)
def forward(self, numerical_input, categorical_inputs):
"""
Args:
numerical_input (Tensor): with shape [batch_size, num_numerical_features]
categorical_inputs (Tensor): with shape [batch_size, num_categorical_features]
"""
batch_size = numerical_input.size()[0]
# Put indices on the same device as corresponding embedding
device_indices = []
for embedding_id, _ in enumerate(self.embeddings):
device_indices.append(categorical_inputs[:, embedding_id].to(self._embedding_device_map[embedding_id]))
bottom_mlp_output = self.bottom_mlp(numerical_input)
# embedding_outputs will be a list of (26 in the case of Criteo) fetched embeddings with shape
# [batch_size, embedding_size]
embedding_outputs = []
for embedding_id, embedding in enumerate(self.embeddings):
if self._hash_indices:
device_indices[embedding_id] = device_indices[embedding_id] % embedding.num_embeddings
embedding_outputs.append(embedding(device_indices[embedding_id]).to(self._base_device))
interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size)
top_mlp_output = self.top_mlp(interaction_output)
return top_mlp_output
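A minimal usage sketch of this module with a toy configuration (sizes chosen so that `embedding_dim` matches the last bottom-MLP layer, as required above; the import path follows `dlrm/scripts/main.py`):
```python
import torch
from dlrm.model import Dlrm  # import path per main.py

# Toy model: 3 categorical features with 100 unique values each, 13 numerical features.
model = Dlrm(num_numerical_features=13,
             categorical_feature_sizes=[100, 100, 100],
             bottom_mlp_sizes=[64, 32, 16],
             top_mlp_sizes=[64, 32, 1],
             embedding_dim=16,   # must equal bottom_mlp_sizes[-1]
             base_device="cpu")

numerical = torch.rand(8, 13)                     # [batch_size, num_numerical_features]
categorical = torch.randint(0, 100, size=(8, 3))  # [batch_size, num_categorical_features]
print(model(numerical, categorical).shape)        # torch.Size([8, 1])
```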

View file

@@ -0,0 +1,510 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os
import numpy as np
import json
from pprint import pprint
from time import time
from sklearn.metrics import roc_auc_score
from absl import app
from absl import flags
import dllogger
import torch
from apex import amp
from dlrm.data import data_loader
from dlrm.data.synthetic_dataset import SyntheticDataset
from dlrm.model import Dlrm
import dlrm.scripts.utils as utils
FLAGS = flags.FLAGS
# Basic run settings
flags.DEFINE_enum("mode", default='train', enum_values=['train', 'test', 'inference_benchmark'],
help="Select task to be performed")
flags.DEFINE_integer("seed", 12345, "Random seed")
# Training schedule flags
flags.DEFINE_integer("batch_size", 32768, "Batch size used for training")
flags.DEFINE_integer("test_batch_size", 32768, "Batch size used for testing/validation")
flags.DEFINE_float("lr", 28, "Base learning rate")
flags.DEFINE_integer("epochs", 1, "Number of epochs to train for")
flags.DEFINE_integer("max_steps", None, "Stop training after doing this many optimization steps")
flags.DEFINE_integer("warmup_factor", 0, "Learning rate warmup factor. Must be a non-negative integer")
flags.DEFINE_integer("warmup_steps", 6400, "Number of warmup optimization steps")
flags.DEFINE_integer("decay_steps", 80000, "Polynomial learning rate decay steps. If equal to 0 will not do any decaying")
flags.DEFINE_integer("decay_start_step", 64000,
"Optimization step after which to start decaying the learning rate, if None will start decaying right after the warmup phase is completed")
# Model configuration
flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of embedding space for categorical features")
flags.DEFINE_list("top_mlp_sizes", [1024, 1024, 512, 256, 1], "Linear layer sizes for the top MLP")
flags.DEFINE_list("bottom_mlp_sizes", [512, 256, 128], "Linear layer sizes for the bottom MLP")
flags.DEFINE_string("interaction_op", "dot",
"Type of interaction operation to perform. Supported choices: 'dot' or 'cat'")
flags.DEFINE_boolean("self_interaction", False, "Set to True to use self-interaction")
flags.DEFINE_string(
"dataset", None,
"Full path to binary dataset. Must include files such as: train_data.bin, test_data.bin")
flags.DEFINE_boolean("synthetic_dataset", False, "Use synthetic instead of real data for benchmarking purposes")
flags.DEFINE_list("synthetic_dataset_table_sizes", default=','.join(26 * [str(10**5)]),
help="Embedding table sizes to use with the synthetic dataset")
flags.DEFINE_boolean("shuffle_batch_order", False, "Read batch in train dataset by random order", short_name="shuffle")
flags.DEFINE_integer("num_numerical_features", 13,
"Number of numerical features in the dataset. Defaults to 13 for the Criteo Terabyte Dataset")
flags.DEFINE_integer("max_table_size", None,
"Maximum number of rows per embedding table, by default equal to the number of unique values for each categorical variable")
flags.DEFINE_boolean("hash_indices", False,
"If True the model will compute `index := index % table size` to ensure that the indices match table sizes")
flags.DEFINE_float("dataset_subset", None,
"Use only a subset of the training data. If None (default) will use all of it. Must be either None, or a float in range [0,1]")
# Checkpointing
flags.DEFINE_string("load_checkpoint_path", None, "Path from which to load a checkpoint")
flags.DEFINE_string("save_checkpoint_path", None, "Path to which to save the training checkpoints")
# Saving and logging flags
flags.DEFINE_string("output_dir", "/tmp", "Path where to save the checkpoints")
flags.DEFINE_string("log_path", "./log.json", "Destination for the log file with various results and statistics")
flags.DEFINE_integer("test_freq", None, "Number of optimization steps between validations. If None will test after each epoch")
flags.DEFINE_float("test_after", 0, "Don't test the model unless this many epochs has been completed")
flags.DEFINE_integer("print_freq", 200, "Number of optimizations steps between printing training status to stdout")
flags.DEFINE_integer("benchmark_warmup_steps", 0, "Number of initial iterations to exclude from throughput measurements")
# Machine setting flags
flags.DEFINE_string("base_device", "cuda", "Device to run the majority of the model operations")
flags.DEFINE_boolean("fp16", True, "If True (default) the script will use Automatic Mixed Precision")
flags.DEFINE_float("loss_scale", 8192, "Static loss scale for Mixed Precision Training")
# inference benchmark
flags.DEFINE_list("inference_benchmark_batch_sizes", default=[1, 64, 4096],
help="Batch sizes for inference throughput and latency measurements")
flags.DEFINE_integer("inference_benchmark_steps", 200,
"Number of steps for measuring inference latency and throughput")
flags.DEFINE_float("auc_threshold", None, "Stop the training after achieving this AUC")
def validate_flags():
if FLAGS.max_table_size is not None and not FLAGS.hash_indices:
raise ValueError('Hash indices must be True when setting a max_table_size')
def create_synthetic_datasets(train_batch_size, test_batch_size):
categorical_sizes = get_categorical_feature_sizes()
dataset_train = SyntheticDataset(num_entries=4 * 10**9,
batch_size=train_batch_size,
dense_features=FLAGS.num_numerical_features,
categorical_feature_sizes=categorical_sizes)
dataset_test = SyntheticDataset(num_entries=100 * 10**6,
batch_size=test_batch_size,
dense_features=FLAGS.num_numerical_features,
categorical_feature_sizes=categorical_sizes)
return dataset_train, dataset_test
def create_real_datasets(train_batch_size, test_batch_size, online_shuffle=True):
train_dataset = os.path.join(FLAGS.dataset, "train_data.bin")
test_dataset = os.path.join(FLAGS.dataset, "test_data.bin")
categorical_sizes = get_categorical_feature_sizes()
dataset_train = data_loader.CriteoBinDataset(
data_file=train_dataset,
batch_size=train_batch_size, subset=FLAGS.dataset_subset,
numerical_features=FLAGS.num_numerical_features,
categorical_features=len(categorical_sizes),
online_shuffle=online_shuffle
)
dataset_test = data_loader.CriteoBinDataset(
data_file=test_dataset, batch_size=test_batch_size,
numerical_features=FLAGS.num_numerical_features,
categorical_features=len(categorical_sizes),
online_shuffle = False
)
return dataset_train, dataset_test
def get_dataloaders(train_batch_size, test_batch_size):
print("Creating data loaders")
if FLAGS.synthetic_dataset:
dataset_train, dataset_test = create_synthetic_datasets(train_batch_size, test_batch_size)
else:
dataset_train, dataset_test = create_real_datasets(train_batch_size,
test_batch_size,
online_shuffle=FLAGS.shuffle_batch_order)
if FLAGS.shuffle_batch_order and not FLAGS.synthetic_dataset:
train_sampler = torch.utils.data.RandomSampler(dataset_train)
else:
train_sampler = None
data_loader_train = torch.utils.data.DataLoader(
dataset_train, batch_size=None, num_workers=0, pin_memory=False, sampler=train_sampler)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=None, num_workers=0, pin_memory=False)
return data_loader_train, data_loader_test
def get_categorical_feature_sizes():
if FLAGS.synthetic_dataset:
feature_sizes = [int(s) for s in FLAGS.synthetic_dataset_table_sizes]
return feature_sizes
categorical_sizes_file = os.path.join(FLAGS.dataset, "model_size.json")
with open(categorical_sizes_file) as f:
categorical_sizes = json.load(f).values()
categorical_sizes = list(categorical_sizes)
# need to add 1 because the JSON file contains the max value not the count
categorical_sizes = [s + 1 for s in categorical_sizes]
if FLAGS.max_table_size is None:
return categorical_sizes
clipped_sizes = [min(s, FLAGS.max_table_size) for s in categorical_sizes]
return clipped_sizes
def create_model():
print("Creating model")
model_config = {
'top_mlp_sizes': FLAGS.top_mlp_sizes,
'bottom_mlp_sizes': FLAGS.bottom_mlp_sizes,
'embedding_dim': FLAGS.embedding_dim,
'interaction_op': FLAGS.interaction_op,
'self_interaction': FLAGS.self_interaction,
'categorical_feature_sizes': get_categorical_feature_sizes(),
'num_numerical_features': FLAGS.num_numerical_features,
'hash_indices': FLAGS.hash_indices,
'base_device': FLAGS.base_device,
}
model = Dlrm.from_dict(model_config)
print(model)
if FLAGS.load_checkpoint_path is not None:
model.load_state_dict(torch.load(FLAGS.load_checkpoint_path, map_location="cpu"))
model.to(FLAGS.base_device)
return model
def main(argv):
validate_flags()
torch.manual_seed(FLAGS.seed)
utils.init_logging(log_path=FLAGS.log_path)
dllogger.log(data=FLAGS.flag_values_dict(), step='PARAMETER')
data_loader_train, data_loader_test = get_dataloaders(train_batch_size=FLAGS.batch_size,
test_batch_size=FLAGS.test_batch_size)
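    # When --fp16 is used, gradients stay multiplied by the static loss scale
    # (amp is initialized with loss_scale=1 below), so the learning rate is
    # divided by the same factor to compensate.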
scaled_lr = FLAGS.lr / FLAGS.loss_scale if FLAGS.fp16 else FLAGS.lr
model = create_model()
optimizer = torch.optim.SGD(model.parameters(), lr=scaled_lr)
if FLAGS.fp16 and FLAGS.mode == 'train':
(model.top_mlp, model.bottom_mlp), optimizer = amp.initialize([model.top_mlp, model.bottom_mlp],
optimizer, opt_level="O2",
loss_scale=1)
elif FLAGS.fp16:
model = model.half()
loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")
loss_fn = torch.jit.trace(loss_fn.forward, (torch.rand(FLAGS.batch_size, 1).cuda(),
torch.rand(FLAGS.batch_size, 1).cuda()))
if FLAGS.mode == 'test':
loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
avg_test_throughput = FLAGS.batch_size / test_step_time
results = {'auc': auc,
'avg_inference_latency': test_step_time,
'average_test_throughput': avg_test_throughput}
dllogger.log(data=results, step=tuple())
print(F"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
return
if FLAGS.mode == 'inference_benchmark':
results = {}
if FLAGS.fp16:
# can use pure FP16 for inference
model = model.half()
for batch_size in FLAGS.inference_benchmark_batch_sizes:
batch_size = int(batch_size)
_, benchmark_data_loader = get_dataloaders(train_batch_size=batch_size,
test_batch_size=batch_size)
latencies = inference_benchmark(model=model, data_loader=benchmark_data_loader,
num_batches=FLAGS.inference_benchmark_steps)
print("All inference latencies: {}".format(latencies))
mean_latency = np.mean(latencies)
mean_inference_throughput = batch_size / mean_latency
subresult = {F'mean_inference_latency_batch_{batch_size}': mean_latency,
F'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
results.update(subresult)
dllogger.log(data=results, step=tuple())
print(F"Finished inference benchmark.")
return
if FLAGS.mode == 'train':
train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled_lr)
def maybe_save_checkpoint(model, path):
if path is None:
return
begin = time()
torch.save(model.state_dict(), path)
end = time()
print(f'Checkpoint saving took {end-begin:,.2f} [s]')
def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled_lr):
"""Train and evaluate the model
Args:
model (dlrm):
loss_fn (torch.nn.Module): Loss function
optimizer (torch.nn.optim):
data_loader_train (torch.utils.data.DataLoader):
data_loader_test (torch.utils.data.DataLoader):
"""
model.train()
base_device = FLAGS.base_device
print_freq = FLAGS.print_freq
steps_per_epoch = len(data_loader_train)
test_freq = FLAGS.test_freq if FLAGS.test_freq is not None else steps_per_epoch
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('loss', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.4f}'))
metric_logger.add_meter('step_time', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.6f}'))
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.4f}'))
timer = utils.StepTimer()
best_auc = 0
best_epoch = 0
start_time = time()
for epoch in range(FLAGS.epochs):
batch_iter = iter(data_loader_train)
for step in range(len(data_loader_train)):
timer.click()
global_step = steps_per_epoch * epoch + step
numerical_features, categorical_features, click = next(batch_iter)
categorical_features = categorical_features.to(base_device).to(torch.long)
numerical_features = numerical_features.to(base_device)
click = click.to(base_device).to(torch.float32)
utils.lr_step(optimizer, num_warmup_iter=FLAGS.warmup_steps, current_step=global_step + 1,
base_lr=scaled_lr, warmup_factor=FLAGS.warmup_factor,
decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)
if FLAGS.max_steps and global_step > FLAGS.max_steps:
print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
break
output = model(numerical_features, categorical_features).squeeze().float()
loss = loss_fn(output, click.squeeze())
optimizer.zero_grad()
if FLAGS.fp16:
loss *= FLAGS.loss_scale
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
optimizer.step()
loss_value = loss.item()
if timer.measured is None:
# first iteration, no step time etc. to print
continue
if global_step < FLAGS.benchmark_warmup_steps:
metric_logger.update(
loss=loss_value, lr=optimizer.param_groups[0]["lr"])
else:
unscale_factor = FLAGS.loss_scale if FLAGS.fp16 else 1
metric_logger.update(
loss=loss_value / unscale_factor, step_time=timer.measured,
lr=optimizer.param_groups[0]["lr"] * unscale_factor
)
if step % print_freq == 0 and step > 0:
if global_step < FLAGS.benchmark_warmup_steps:
print(F'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
continue
eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
metric_logger.print(
header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")
if (global_step + 1) % test_freq == 0 and global_step > 0 and global_step / steps_per_epoch >= FLAGS.test_after:
loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
print(F"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")
if auc > best_auc:
best_auc = auc
best_epoch = epoch + ((step + 1) / steps_per_epoch)
maybe_save_checkpoint(model, FLAGS.save_checkpoint_path)
if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
stop_time = time()
run_time_s = int(stop_time - start_time)
print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
return
avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg
results = {'best_auc' : best_auc,
'best_epoch' : best_epoch,
'average_train_throughput' : avg_throughput}
if 'test_step_time' in locals():
avg_test_throughput = FLAGS.test_batch_size / test_step_time
results['average_test_throughput'] = avg_test_throughput
dllogger.log(data=results, step=tuple())
def evaluate(model, loss_fn, data_loader):
"""Test dlrm model
Args:
model (dlrm):
loss_fn (torch.nn.Module): Loss function
data_loader (torch.utils.data.DataLoader):
"""
model.eval()
base_device = FLAGS.base_device
print_freq = FLAGS.print_freq
steps_per_epoch = len(data_loader)
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('loss', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.4f}'))
metric_logger.add_meter('step_time', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.4f}'))
with torch.no_grad():
y_true = []
y_score = []
timer = utils.StepTimer()
batch_iter = iter(data_loader)
timer.click()
for step in range(len(data_loader)):
numerical_features, categorical_features, click = next(batch_iter)
categorical_features = categorical_features.to(base_device).to(torch.long)
numerical_features = numerical_features.to(base_device)
click = click.to(torch.float32).to(base_device)
if FLAGS.fp16:
numerical_features = numerical_features.half()
output = model(numerical_features, categorical_features).squeeze()
loss = loss_fn(output, click)
y_true.append(click)
y_score.append(output)
loss_value = loss.item()
timer.click()
if timer.measured is not None:
metric_logger.update(loss=loss_value, step_time=timer.measured)
if step % print_freq == 0 and step > 0:
metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
y_true = torch.cat(y_true).cpu().numpy()
y_score = torch.cat(y_score).cpu().numpy()
auc = roc_auc_score(y_true=y_true, y_score=y_score)
model.train()
return metric_logger.loss.global_avg, auc, metric_logger.step_time.avg
def inference_benchmark(model, data_loader, num_batches=100):
model.eval()
base_device = FLAGS.base_device
latencies = []
with torch.no_grad():
for step, (numerical_features, categorical_features, click) in enumerate(data_loader):
if step > num_batches:
break
step_start_time = time()
numerical_features = numerical_features.to(base_device)
if FLAGS.fp16:
numerical_features = numerical_features.half()
categorical_features = categorical_features.to(device=base_device, dtype=torch.int64)
_ = model(numerical_features, categorical_features).squeeze()
torch.cuda.synchronize()
step_time = time() - step_start_time
if step >= FLAGS.benchmark_warmup_steps:
latencies.append(step_time)
return latencies
if __name__ == '__main__':
app.run(main)

View file

@@ -0,0 +1,278 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict, deque
import datetime
import time
import torch
import torch.distributed as dist
import errno
import os
import dllogger
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
for meter in self.meters.values():
meter.synchronize_between_processes()
def add_meter(self, name, meter):
self.meters[name] = meter
def print(self, header=None):
if not header:
header = ''
print_str = header
for name, meter in self.meters.items():
print_str += F" {name}: {meter}"
print(print_str)
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target[None])
res = []
for k in topk:
correct_k = correct[:k].flatten().sum(dtype=torch.float32)
res.append(correct_k * (100.0 / batch_size))
return res
def lr_step(optim, num_warmup_iter, current_step, base_lr, warmup_factor, decay_steps=0, decay_start_step=None):
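    # Linear warmup of the learning rate up to base_lr for the first num_warmup_iter steps,
    # then quadratic polynomial decay over decay_steps starting at decay_start_step
    # (floored at a small minimum value).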
if decay_start_step is None:
decay_start_step = num_warmup_iter
new_lr = base_lr
if decay_start_step < num_warmup_iter:
raise ValueError('Learning rate warmup must finish before decay starts')
if current_step <= num_warmup_iter:
warmup_step = base_lr / (num_warmup_iter * (2 ** warmup_factor))
new_lr = base_lr - (num_warmup_iter - current_step) * warmup_step
steps_since_decay_start = current_step - decay_start_step
if decay_steps != 0 and steps_since_decay_start > 0:
already_decayed_steps = min(steps_since_decay_start, decay_steps)
new_lr = base_lr * ((decay_steps - already_decayed_steps) / decay_steps) ** 2
min_lr = 0.0000001
new_lr = max(min_lr, new_lr)
for param_group in optim.param_groups:
param_group['lr'] = new_lr
def mkdir(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def init_logging(log_path):
json_backend = dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=log_path)
stdout_backend = dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
stdout_backend._metadata['best_auc'].update({'format': '0:.5f'})
stdout_backend._metadata['best_epoch'].update({'format': '0:.2f'})
stdout_backend._metadata['average_train_throughput'].update({'format': ':.2e'})
stdout_backend._metadata['average_test_throughput'].update({'format': ':.2e'})
dllogger.init(backends=[json_backend, stdout_backend])
def save_on_master(*args, **kwargs):
if is_main_process():
torch.save(*args, **kwargs)
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
print('Not using distributed mode')
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
setup_for_distributed(args.rank == 0)
class StepTimer():
def __init__(self):
self._previous = None
self._new = None
self.measured = None
def click(self):
self._previous = self._new
self._new = time.time()
if self._previous is not None:
self.measured = self._new - self._previous

View file

@@ -0,0 +1,726 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Gwt7z7qdmTbW"
},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "i4NKCp2VmTbn"
},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# DLRM Triton Inference Demo"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fW0OKDzvmTbt"
},
"source": [
"## Overview\n",
"\n",
"Recomendation system (RecSys) inference involves determining an ordered list of items with which the query user will most likely interact with. For very large commercial databases with millions to hundreds of millions of items to choose from (like advertisements, apps), usually an item retrieval procedure is carried out to reduce the number of items to a more manageable quantity, e.g. a few hundreds to a few thousands. The methods include computationally-light algorithms such as approximate neighborhood search, random forest and filtering based on user preferences. From thereon, a deep learning based RecSys is invoked to re-rank the items and those with the highest scores are presented to the users. This process is well demonstrated in the Google AppStore recommendation system in Figure 1. \n",
"\n",
"![DLRM_model](recsys_inference.PNG)\n",
"\n",
"Figure 1: Googles app recommendation process. [Source](https://arxiv.org/pdf/1606.07792.pdf).\n",
"\n",
"As we can see, for each query user, the number of user-item pairs to score can be as large as a few thousands. This places an extremely heavy duty on RecSys inference server, which must handle high throughput to serve many users concurrently yet at low latency to satisfy stringent latency thresholds of online commerce engines.\n",
"\n",
"The NVIDIA Triton Inference Server [9] provides a cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. Triton automatically manages and makes use of all the available GPUs.\n",
"\n",
"We will next see how to prepare the DLRM model for inference with the Triton inference server and see how Triton is up to the task. \n",
"\n",
"### Learning objectives\n",
"\n",
"This notebook demonstrates the steps for preparing a pre-trained DLRM model for deployment and inference with the NVIDIA [Triton inference server](https://github.com/NVIDIA/triton-inference-server). \n",
"\n",
"## Content\n",
"1. [Requirements](#1)\n",
"1. [Prepare model for inference](#2)\n",
"1. [Start the Triton inference server](#3)\n",
"1. [Testing server with the performance client](#4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "aDFrE4eqmTbv"
},
"source": [
"<a id=\"1\"></a>\n",
"## 1. Requirements\n",
"\n",
"\n",
"### 1.1 Docker container\n",
"The most convenient way to make use of the NVIDIA DLRM model is via a docker container, which provides a self-contained, isolated and re-producible environment for all experiments.\n",
"\n",
"First, clone the repository:\n",
"\n",
"```\n",
"git clone https://github.com/NVIDIA/DeepLearningExamples\n",
"cd DeepLearningExamples/PyTorch/Recommendation/DLRM\n",
"```\n",
"\n",
"To execute this notebook, first build the following inference container:\n",
"\n",
"```\n",
"docker build -t dlrm-inference . -f triton/Dockerfile\n",
"```\n",
"\n",
"Start in interactive docker session with:\n",
"\n",
"```\n",
"docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_SAVED_MODEL>:/models -v <PATH_TO_EXPORT_MODEL>:/repository <PATH_TO_PREPROCESSED_DATA>:/data dlrm-inference bash\n",
"```\n",
"where:\n",
"\n",
"- PATH_TO_SAVED_MODEL: directory containing the trained DLRM models with `.pt` extension.\n",
" \n",
"- PATH_TO_EXPORT_MODEL: directory which will contain the converted model to be used with the NVIDIA Triton inference server.\n",
"\n",
"- PATH_TO_PREPROCESSED_DATA: path to the preprocessed Criteo Terabyte dataset containing 3 binary data files: `test_data.bin`, `train_data.bin` and `val_data.bin` and a JSON `file model_size.json` totalling ~650GB.\n",
"\n",
"Within the docker interactive bash session, start Jupyter with\n",
"\n",
"```\n",
"export PYTHONPATH=/workspace/dlrm\n",
"jupyter notebook --ip 0.0.0.0 --port 8888\n",
"```\n",
"\n",
"Then open the Jupyter GUI interface on your host machine at http://localhost:8888. Within the container, this demo notebook is located at `/workspace/dlrm/notebooks`.\n",
"\n",
"### 1.2 Hardware\n",
"This notebook can be executed on any CUDA-enabled NVIDIA GPU with at least 24GB of GPU memory, although for efficient mixed precision inference, a [Tensor Core NVIDIA GPU](https://www.nvidia.com/en-us/data-center/tensorcore/) is desired (Volta, Turing or newer architectures). "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k7RLEcKhmTb0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sat Apr 4 00:55:05 2020 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 440.33.01 Driver Version: 440.33.01 CUDA Version: 10.2 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 Tesla V100-PCIE... On | 00000000:1A:00.0 Off | 0 |\r\n",
"| N/A 30C P0 37W / 250W | 19757MiB / 32510MiB | 0% Default |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: GPU Memory |\r\n",
"| GPU PID Type Process name Usage |\r\n",
"|=============================================================================|\r\n",
"+-----------------------------------------------------------------------------+\r\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HqSUGePjmTb9"
},
"source": [
"<a id=\"2\"></a>\n",
"## 2. Prepare model for inference\n",
"\n",
"We first convert model to a format accepted by the NVIDIA Triton inference server. Triton can accept TorchScript, ONNX amongst other formats. \n",
"\n",
"To deploy model into Triton compatible format, we provide the deployer.py [script](../triton/deployer.py).\n",
"\n",
"### TorchScript\n",
"TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency.\n",
"\n",
"We provide two options to convert models to TorchScript:\n",
"- --ts-script convert to torchscript using torch.jit.script\n",
"- --ts-trace convert to torchscript using torch.jit.trace\n",
"\n",
"\n",
"In the conversion below, we assume:\n",
"\n",
"- The trained model is stored at /models/dlrm_model_fp16.pt\n",
"\n",
"- The maximum batchsize that Triton will handle is 65536.\n",
"\n",
"- The processed dataset directory is /data which contain a `model_size.json` file."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deploying model dlrm-ts-script-16 in format pytorch_libtorch\n",
"done\n"
]
}
],
"source": [
"%%bash\n",
"python ../triton/deployer.py \\\n",
"--ts-script \\\n",
"--triton-model-name dlrm-ts-script-16 \\\n",
"--triton-max-batch-size 65536 \\\n",
"--save-dir /repository \\\n",
"-- --model_checkpoint /models/dlrm_model_fp16.pt \\\n",
"--fp16 \\\n",
"--batch_size 4096 \\\n",
"--num_numerical_features 13 \\\n",
"--embedding_dim 128 \\\n",
"--top_mlp_sizes 1024 1024 512 256 1 \\\n",
"--bottom_mlp_sizes 512 256 128 \\\n",
"--interaction_op dot \\\n",
"--hash_indices \\\n",
"--dataset /data \\\n",
"--dump_perf_data ./perfdata"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "EQAIszkxmTcT"
},
"source": [
"### ONNX\n",
"\n",
"[ONNX](https://onnx.ai/) is an open format built to represent machine learning models. ONNX defines a common set of operators - the building blocks of machine learning and deep learning models - and a common file format to enable AI developers to use models with a variety of frameworks, tools, runtimes, and compilers.\n",
"\n",
"Conversion of DLRM pre-trained PyTorch model to ONNX model can be done with:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deploying model dlrm-onnx-16 in format onnxruntime_onnx\n",
"done\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.6/site-packages/torch/onnx/symbolic_opset9.py:2044: UserWarning: Exporting aten::index operator of advanced indexing in opset 11 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n",
"/opt/conda/lib/python3.6/site-packages/torch/onnx/utils.py:915: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input input__0\n",
" 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n",
"/opt/conda/lib/python3.6/site-packages/torch/onnx/utils.py:915: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input input__1\n",
" 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n",
"/opt/conda/lib/python3.6/site-packages/torch/onnx/utils.py:915: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input output__0\n",
" 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n"
]
}
],
"source": [
"%%bash\n",
"python ../triton/deployer.py \\\n",
"--onnx \\\n",
"--triton-model-name dlrm-onnx-16 \\\n",
"--triton-max-batch-size 4096 \\\n",
"--save-dir /repository \\\n",
"-- --model_checkpoint /models/dlrm_model_fp16.pt \\\n",
"--fp16 \\\n",
"--batch_size 4096 \\\n",
"--num_numerical_features 13 \\\n",
"--embedding_dim 128 \\\n",
"--top_mlp_sizes 1024 1024 512 256 1 \\\n",
"--bottom_mlp_sizes 512 256 128 \\\n",
"--interaction_op dot \\\n",
"--hash_indices \\\n",
"--dataset /data \\\n",
"--dump_perf_data ./perfdata"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "RL8d9IwzmTcV"
},
"source": [
"<a id=\"3\"></a>\n",
"## 3. Start the Triton inference server"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "o6wayGf1mTcX"
},
"source": [
"*Note: this step must be done outside the of the current docker container.*\n",
"\n",
"Open a bash window on the **host machine** and execute the following commands:\n",
"\n",
"```\n",
"docker pull nvcr.io/nvidia/tensorrtserver:20.03-py3\n",
"docker run -d --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v <PATH_TO_MODEL_REPOSITORY>:/repository nvcr.io/nvidia/tensorrtserver:20.03-py3 trtserver --model-store=/repository --log-verbose=1 --model-control-mode=explicit\n",
"```\n",
"\n",
"where:\n",
"\n",
"- PATH_TO_MODEL_REPOSITORY: directory on the host machine containing the converted models in section 2 above. \n",
"\n",
"Note that each DLRM model will require ~19GB of GPU memory.\n",
"\n",
"Within the `/models` directory on the inference server, the structure should look similar to the below:\n",
"\n",
"```\n",
"/models\n",
"`-- dlrm-onnx-16\n",
" |-- 1\n",
" | `-- model.onnx\n",
" | |-- bottom_mlp.0.weight\n",
" | |-- bottom_mlp.2.weight\n",
" | |-- bottom_mlp.4.weight\n",
" | |-- embeddings.0.weight\n",
" | |-- embeddings.1.weight\n",
" | |-- embeddings.10.weight\n",
" | |-- embeddings.11.weight\n",
" | |-- embeddings.12.weight\n",
" | |-- embeddings.13.weight\n",
" | |-- embeddings.14.weight\n",
" | |-- embeddings.15.weight\n",
" | |-- embeddings.17.weight\n",
" | |-- embeddings.18.weight\n",
" | |-- embeddings.19.weight\n",
" | |-- embeddings.2.weight\n",
" | |-- embeddings.20.weight\n",
" | |-- embeddings.21.weight\n",
" | |-- embeddings.22.weight\n",
" | |-- embeddings.23.weight\n",
" | |-- embeddings.24.weight\n",
" | |-- embeddings.25.weight\n",
" | |-- embeddings.3.weight\n",
" | |-- embeddings.4.weight\n",
" | |-- embeddings.6.weight\n",
" | |-- embeddings.7.weight\n",
" | |-- embeddings.8.weight\n",
" | |-- embeddings.9.weight\n",
" | |-- model.onnx\n",
" | |-- top_mlp.0.weight\n",
" | |-- top_mlp.2.weight\n",
" | |-- top_mlp.4.weight\n",
" | `-- top_mlp.6.weight\n",
" `-- config.pbtxt\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "X959LYwjmTcw"
},
"source": [
"<a id=\"4\"></a>\n",
"## 4. Testing server with the performance client\n",
"\n",
"After model deployment has completed, we can test the deployed model against the Criteo test dataset. \n",
"\n",
"Note: This requires mounting the Criteo test data to, e.g. `/data/test_data.bin`. Within the dataset directory, there must also be a `model_size.json` file."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Process is terminated.\n"
]
}
],
"source": [
"%%bash\n",
"python ../triton/client.py \\\n",
"--triton-server-url localhost:8000 \\\n",
"--protocol HTTP \\\n",
"--triton-model-name dlrm-onnx-16 \\\n",
"--num_numerical_features 13 \\\n",
"--dataset_config /data/model_size.json \\\n",
"--inference_data /data/test_data.bin \\\n",
"--batch_size 4096 \\\n",
"--fp16"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Triton inference server comes with a [performance client](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/optimization.html#perf-client) which is designed to stress test the server using multiple client threads.\n",
"\n",
"The perf_client generates inference requests to your model and measures the throughput and latency of those requests. To get representative results, the perf_client measures the throughput and latency over a time window, and then repeats the measurements until it gets stable values. By default the perf_client uses average latency to determine stability but you can use the --percentile flag to stabilize results based on that confidence level. For example, if --percentile=95 is used the results will be stabilized using the 95-th percentile request latency. \n",
"\n",
"### Request Concurrency\n",
"\n",
"By default perf_client measures your models latency and throughput using the lowest possible load on the model. To do this perf_client sends one inference request to the server and waits for the response. When that response is received, the perf_client immediately sends another request, and then repeats this process during the measurement windows. The number of outstanding inference requests is referred to as the request concurrency, and so by default perf_client uses a request concurrency of 1.\n",
"\n",
"Using the --concurrency-range <start>:<end>:<step> option you can have perf_client collect data for a range of request concurrency levels. Use the --help option to see complete documentation for this and other options.\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*** Measurement Settings ***\n",
" Batch size: 4096\n",
" Measurement window: 5000 msec\n",
" Latency limit: 5000 msec\n",
" Concurrency limit: 10 concurrent requests\n",
" Using synchronous calls for inference\n",
" Stabilizing using average latency\n",
"\n",
"Request concurrency: 1\n",
" Pass [1] throughput: 67993.6 infer/sec. Avg latency: 60428 usec (std 22260 usec)\n",
" Pass [2] throughput: 61440 infer/sec. Avg latency: 66310 usec (std 21723 usec)\n",
" Pass [3] throughput: 68812.8 infer/sec. Avg latency: 59617 usec (std 22128 usec)\n",
" Client: \n",
" Request count: 84\n",
" Throughput: 68812.8 infer/sec\n",
" Avg latency: 59617 usec (standard deviation 22128 usec)\n",
" p50 latency: 71920 usec\n",
" p90 latency: 80018 usec\n",
" p95 latency: 83899 usec\n",
" p99 latency: 88054 usec\n",
" Avg gRPC time: 58773 usec (marshal 274 usec + response wait 58458 usec + unmarshal 41 usec)\n",
" Server: \n",
" Request count: 102\n",
" Avg request latency: 57208 usec (overhead 6 usec + queue 20184 usec + compute 37018 usec)\n",
"\n",
"Request concurrency: 2\n",
" Pass [1] throughput: 154010 infer/sec. Avg latency: 53139 usec (std 22418 usec)\n",
" Pass [2] throughput: 155648 infer/sec. Avg latency: 52483 usec (std 24768 usec)\n",
" Pass [3] throughput: 150733 infer/sec. Avg latency: 54271 usec (std 23803 usec)\n",
" Client: \n",
" Request count: 184\n",
" Throughput: 150733 infer/sec\n",
" Avg latency: 54271 usec (standard deviation 23803 usec)\n",
" p50 latency: 57022 usec\n",
" p90 latency: 83000 usec\n",
" p95 latency: 84782 usec\n",
" p99 latency: 88989 usec\n",
" Avg gRPC time: 55692 usec (marshal 274 usec + response wait 55374 usec + unmarshal 44 usec)\n",
" Server: \n",
" Request count: 216\n",
" Avg request latency: 53506 usec (overhead 244 usec + queue 19818 usec + compute 33444 usec)\n",
"\n",
"Request concurrency: 3\n",
" Pass [1] throughput: 189235 infer/sec. Avg latency: 64917 usec (std 21807 usec)\n",
" Pass [2] throughput: 201523 infer/sec. Avg latency: 60425 usec (std 24622 usec)\n",
" Pass [3] throughput: 203981 infer/sec. Avg latency: 60661 usec (std 24397 usec)\n",
" Client: \n",
" Request count: 249\n",
" Throughput: 203981 infer/sec\n",
" Avg latency: 60661 usec (standard deviation 24397 usec)\n",
" p50 latency: 72344 usec\n",
" p90 latency: 87765 usec\n",
" p95 latency: 91976 usec\n",
" p99 latency: 95775 usec\n",
" Avg gRPC time: 57213 usec (marshal 291 usec + response wait 56875 usec + unmarshal 47 usec)\n",
" Server: \n",
" Request count: 315\n",
" Avg request latency: 55254 usec (overhead 545 usec + queue 19408 usec + compute 35301 usec)\n",
"\n",
"Request concurrency: 4\n",
" Pass [1] throughput: 273613 infer/sec. Avg latency: 59555 usec (std 22608 usec)\n",
" Pass [2] throughput: 288358 infer/sec. Avg latency: 56895 usec (std 21886 usec)\n",
" Pass [3] throughput: 285082 infer/sec. Avg latency: 57494 usec (std 21833 usec)\n",
" Client: \n",
" Request count: 348\n",
" Throughput: 285082 infer/sec\n",
" Avg latency: 57494 usec (standard deviation 21833 usec)\n",
" p50 latency: 62012 usec\n",
" p90 latency: 83694 usec\n",
" p95 latency: 84966 usec\n",
" p99 latency: 93177 usec\n",
" Avg gRPC time: 59042 usec (marshal 317 usec + response wait 58669 usec + unmarshal 56 usec)\n",
" Server: \n",
" Request count: 404\n",
" Avg request latency: 56316 usec (overhead 569 usec + queue 19140 usec + compute 36607 usec)\n",
"\n",
"Request concurrency: 5\n",
" Pass [1] throughput: 335872 infer/sec. Avg latency: 60666 usec (std 22599 usec)\n",
" Pass [2] throughput: 308838 infer/sec. Avg latency: 65721 usec (std 22284 usec)\n",
" Pass [3] throughput: 339968 infer/sec. Avg latency: 59920 usec (std 22992 usec)\n",
" Client: \n",
" Request count: 415\n",
" Throughput: 339968 infer/sec\n",
" Avg latency: 59920 usec (standard deviation 22992 usec)\n",
" p50 latency: 67406 usec\n",
" p90 latency: 84561 usec\n",
" p95 latency: 86191 usec\n",
" p99 latency: 94862 usec\n",
" Avg gRPC time: 61127 usec (marshal 304 usec + response wait 60771 usec + unmarshal 52 usec)\n",
" Server: \n",
" Request count: 490\n",
" Avg request latency: 58036 usec (overhead 696 usec + queue 18923 usec + compute 38417 usec)\n",
"\n",
"Request concurrency: 6\n",
" Pass [1] throughput: 368640 infer/sec. Avg latency: 66037 usec (std 20247 usec)\n",
" Pass [2] throughput: 348979 infer/sec. Avg latency: 71309 usec (std 20236 usec)\n",
" Pass [3] throughput: 334234 infer/sec. Avg latency: 72704 usec (std 18491 usec)\n",
" Client: \n",
" Request count: 408\n",
" Throughput: 334234 infer/sec\n",
" Avg latency: 72704 usec (standard deviation 18491 usec)\n",
" p50 latency: 80327 usec\n",
" p90 latency: 87164 usec\n",
" p95 latency: 91824 usec\n",
" p99 latency: 95617 usec\n",
" Avg gRPC time: 71989 usec (marshal 315 usec + response wait 71617 usec + unmarshal 57 usec)\n",
" Server: \n",
" Request count: 504\n",
" Avg request latency: 68951 usec (overhead 957 usec + queue 18350 usec + compute 49644 usec)\n",
"\n",
"Request concurrency: 7\n",
" Pass [1] throughput: 395674 infer/sec. Avg latency: 72406 usec (std 18789 usec)\n",
" Pass [2] throughput: 407142 infer/sec. Avg latency: 69909 usec (std 19644 usec)\n",
" Pass [3] throughput: 355533 infer/sec. Avg latency: 81048 usec (std 12687 usec)\n",
" Client: \n",
" Request count: 434\n",
" Throughput: 355533 infer/sec\n",
" Avg latency: 81048 usec (standard deviation 12687 usec)\n",
" p50 latency: 84046 usec\n",
" p90 latency: 91642 usec\n",
" p95 latency: 94089 usec\n",
" p99 latency: 100453 usec\n",
" Avg gRPC time: 79919 usec (marshal 313 usec + response wait 79552 usec + unmarshal 54 usec)\n",
" Server: \n",
" Request count: 525\n",
" Avg request latency: 76078 usec (overhead 1042 usec + queue 17815 usec + compute 57221 usec)\n",
"\n",
"Request concurrency: 8\n",
" Pass [1] throughput: 524288 infer/sec. Avg latency: 62235 usec (std 15989 usec)\n",
" Pass [2] throughput: 524288 infer/sec. Avg latency: 62741 usec (std 15967 usec)\n",
" Pass [3] throughput: 517734 infer/sec. Avg latency: 63449 usec (std 15144 usec)\n",
" Client: \n",
" Request count: 632\n",
" Throughput: 517734 infer/sec\n",
" Avg latency: 63449 usec (standard deviation 15144 usec)\n",
" p50 latency: 68562 usec\n",
" p90 latency: 75212 usec\n",
" p95 latency: 77256 usec\n",
" p99 latency: 79685 usec\n",
" Avg gRPC time: 62683 usec (marshal 304 usec + response wait 62321 usec + unmarshal 58 usec)\n",
" Server: \n",
" Request count: 768\n",
" Avg request latency: 58942 usec (overhead 1574 usec + queue 2167 usec + compute 55201 usec)\n",
"\n",
"Request concurrency: 9\n",
" Pass [1] throughput: 376832 infer/sec. Avg latency: 98868 usec (std 34719 usec)\n",
" Pass [2] throughput: 407142 infer/sec. Avg latency: 90421 usec (std 35435 usec)\n",
" Pass [3] throughput: 346522 infer/sec. Avg latency: 106082 usec (std 33649 usec)\n",
" Client: \n",
" Request count: 423\n",
" Throughput: 346522 infer/sec\n",
" Avg latency: 106082 usec (standard deviation 33649 usec)\n",
" p50 latency: 122774 usec\n",
" p90 latency: 139616 usec\n",
" p95 latency: 143511 usec\n",
" p99 latency: 148324 usec\n",
" Avg gRPC time: 106566 usec (marshal 323 usec + response wait 106177 usec + unmarshal 66 usec)\n",
" Server: \n",
" Request count: 505\n",
" Avg request latency: 102100 usec (overhead 1046 usec + queue 43598 usec + compute 57456 usec)\n",
"\n",
"Request concurrency: 10\n",
" Pass [1] throughput: 407962 infer/sec. Avg latency: 100260 usec (std 27654 usec)\n",
" Pass [2] throughput: 403866 infer/sec. Avg latency: 101427 usec (std 34082 usec)\n",
" Pass [3] throughput: 412058 infer/sec. Avg latency: 99376 usec (std 31125 usec)\n",
" Client: \n",
" Request count: 503\n",
" Throughput: 412058 infer/sec\n",
" Avg latency: 99376 usec (standard deviation 31125 usec)\n",
" p50 latency: 100025 usec\n",
" p90 latency: 137764 usec\n",
" p95 latency: 141030 usec\n",
" p99 latency: 144104 usec\n",
" Avg gRPC time: 98137 usec (marshal 348 usec + response wait 97726 usec + unmarshal 63 usec)\n",
" Server: \n",
" Request count: 612\n",
" Avg request latency: 94377 usec (overhead 1417 usec + queue 40909 usec + compute 52051 usec)\n",
"\n",
"Inferences/Second vs. Client Average Batch Latency\n",
"Concurrency: 1, throughput: 68812.8 infer/sec, latency 59617 usec\n",
"Concurrency: 2, throughput: 150733 infer/sec, latency 54271 usec\n",
"Concurrency: 3, throughput: 203981 infer/sec, latency 60661 usec\n",
"Concurrency: 4, throughput: 285082 infer/sec, latency 57494 usec\n",
"Concurrency: 5, throughput: 339968 infer/sec, latency 59920 usec\n",
"Concurrency: 6, throughput: 334234 infer/sec, latency 72704 usec\n",
"Concurrency: 7, throughput: 355533 infer/sec, latency 81048 usec\n",
"Concurrency: 8, throughput: 517734 infer/sec, latency 63449 usec\n",
"Concurrency: 9, throughput: 346522 infer/sec, latency 106082 usec\n",
"Concurrency: 10, throughput: 412058 infer/sec, latency 99376 usec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Overriding max_threads specification to ensure requested concurrency range.\n"
]
}
],
"source": [
"%%bash\n",
"/workspace/install/bin/perf_client \\\n",
"--max-threads 10 \\\n",
"-m dlrm-onnx-16 \\\n",
"-x 1 \\\n",
"-p 5000 \\\n",
"-v -i gRPC \\\n",
"-u localhost:8001 \\\n",
"-b 4096 \\\n",
"-l 5000 \\\n",
"--concurrency-range 1:10 \\\n",
"--input-data ./perfdata \\\n",
"-f result.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualizing Latency vs. Throughput\n",
"\n",
"The perf_client provides the -f option to generate a file containing CSV output of the results.\n",
"You can import the CSV file into a spreadsheet to help visualize the latency vs inferences/second tradeoff as well as see some components of the latency. Follow these steps:\n",
"- Open this [spreadsheet](https://docs.google.com/spreadsheets/d/1IsdW78x_F-jLLG4lTV0L-rruk0VEBRL7Mnb-80RGLL4)\n",
"\n",
"- Make a copy from the File menu “Make a copy…”\n",
"\n",
"- Open the copy\n",
"\n",
"- Select the A1 cell on the “Raw Data” tab\n",
"\n",
"- From the File menu select “Import…”\n",
"\n",
"- Select “Upload” and upload the file\n",
"\n",
"- Select “Replace data at selected cell” and then select the “Import data” button\n",
"\n",
"![DLRM_model](latency_vs_throughput.PNG)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "g8MxXY5GmTc8"
},
"source": [
"# Conclusion\n",
"\n",
"In this notebook, we have walked through the complete process of preparing the pretrained DLRM for inference with the Triton inference server. Then, we stress test the server with the performance client to verify inference throughput.\n",
"\n",
"## What's next\n",
"Now it's time to deploy your own DLRM model with Triton. "
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "249yGNLmmTc_"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"include_colab_link": true,
"name": "TensorFlow_UNet_Industrial_Colab_train_and_inference.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

Binary file not shown.


View file

@@ -0,0 +1,470 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Gwt7z7qdmTbW"
},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "i4NKCp2VmTbn"
},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# DLRM Training and Inference Demo"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fW0OKDzvmTbt"
},
"source": [
"## Overview\n",
"\n",
"\n",
"DLRM is a deep learning based approach to recommendation introduced by Facebook. \n",
"Like other deep learning based approaches, DLRM is designed to make use of both categorical and numerical inputs which are usually present in RecSys training data. The architecture of DLRM can be understood via Figure 1. In order to handle categorical data, embedding layers map each category to a dense representation before being fed into dense multilayer perceptrons (MLP). Continuous features can be fed directly into a dense MLP. At the next level, second-order interactions of different features are computed explicitly by taking the dot product between all pairs of embedding vectors and processed dense features. Those pairwise interactions are fed into a top level MLP to compute the likelihood of interaction between users and items. \n",
"\n",
"Compared to other DL based approaches to recommendation, DLRM differs in two ways. First, DLRM computes the feature interaction explicitly while limiting the order of interaction to pairwise interactions. Second, DLRM treats each embedded feature vector (corresponding to categorical features) as a single unit, whereas other methods treat each element in the feature vector as a new unit that should yield different cross terms. These design choices help reduce computational/memory cost while maintaining competitive accuracy.\n",
"\n",
"![DLRM_model](DLRM_architecture.png)\n",
"\n",
"Figure 1. DLRM architecture.\n",
"\n",
"### Learning objectives\n",
"\n",
"This notebook demonstrates the steps for training a DLRM model. We then employ the trained model to make inference on new data.\n",
"\n",
"## Content\n",
"1. [Requirements](#1)\n",
"1. [Data download and preprocessing](#2)\n",
"1. [Training](#3)\n",
"1. [Testing trained model](#4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "aDFrE4eqmTbv"
},
"source": [
"<a id=\"1\"></a>\n",
"## 1. Requirements\n",
"\n",
"\n",
"### 1.1 Docker container\n",
"The most convenient way to make use of the NVIDIA DLRM model is via a docker container, which provides a self-contained, isolated and re-producible environment for all experiments. Refer to the [Quick Start Guide section](../README.md) of the Readme documentation for a comprehensive guide. We briefly summarize the steps here.\n",
"\n",
"First, clone the repository:\n",
"\n",
"```\n",
"git clone https://github.com/NVIDIA/DeepLearningExamples\n",
"cd DeepLearningExamples/PyTorch/Recommendation/DLRM\n",
"```\n",
"\n",
"Next, build the DLRM container:\n",
"```\n",
"docker build . -t nvidia_dlrm_pyt\n",
"```\n",
"\n",
"Make a directory for storing DLRM data and start a docker container with:\n",
"```\n",
"mkdir -p data\n",
"docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_dlrm_pyt bash\n",
"```\n",
"\n",
"Within the docker interactive bash session, start Jupyter with\n",
"\n",
"```\n",
"export PYTHONPATH=/workspace/dlrm\n",
"jupyter notebook --ip 0.0.0.0 --port 8888\n",
"```\n",
"\n",
"Then open the Jupyter GUI interface on your host machine at http://localhost:8888. Within the container, the demo notebooks are located at `/workspace/dlrm/notebooks`.\n",
"\n",
"### 1.2 Hardware\n",
"This notebook can be executed on any CUDA-enabled NVIDIA GPU with at least 24GB of GPU memory, although for efficient mixed precision training, a [Tensor Core NVIDIA GPU](https://www.nvidia.com/en-us/data-center/tensorcore/) is desired (Volta, Turing or newer architectures). "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k7RLEcKhmTb0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sat Mar 28 06:36:59 2020 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 440.33.01 Driver Version: 440.33.01 CUDA Version: 10.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla V100-SXM2... On | 00000000:06:00.0 Off | 0 |\n",
"| N/A 32C P0 42W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 1 Tesla V100-SXM2... On | 00000000:07:00.0 Off | 0 |\n",
"| N/A 34C P0 43W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 2 Tesla V100-SXM2... On | 00000000:0A:00.0 Off | 0 |\n",
"| N/A 34C P0 43W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 3 Tesla V100-SXM2... On | 00000000:0B:00.0 Off | 0 |\n",
"| N/A 32C P0 43W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 4 Tesla V100-SXM2... On | 00000000:85:00.0 Off | 0 |\n",
"| N/A 33C P0 43W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 5 Tesla V100-SXM2... On | 00000000:86:00.0 Off | 0 |\n",
"| N/A 35C P0 44W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 6 Tesla V100-SXM2... On | 00000000:89:00.0 Off | 0 |\n",
"| N/A 37C P0 44W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 7 Tesla V100-SXM2... On | 00000000:8A:00.0 Off | 0 |\n",
"| N/A 34C P0 43W / 300W | 0MiB / 32510MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: GPU Memory |\n",
"| GPU PID Type Process name Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HqSUGePjmTb9"
},
"source": [
"<a id=\"2\"></a>\n",
"## 2. Data download and preprocessing\n",
"\n",
"Commercial recommendation systems are often trained on huge data sets, often in the order of terabytes, if not more. While datasets of this scale are rarely available to the public, the Criteo Terabyte click logs public [dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) offers a rare glimpse into the scale of real enterprise data: it contains ~1.3TB of uncompressed click logs collected over the course of 24 days, that can be used to train RecSys models that predict the ads click through rate. Yet, real datasets can be potentially one or two orders of magnitude larger, as enterprises will try to leverage as much historical data as they can use, for this will generally translate into better accuracy.\n",
"\n",
"Herein, we employ the Criteo Terabyte dataset to demonstrate the efficiency of the GPU-optimized DLRM training procedure. Each record in this dataset contains 40 columns: the first is a label column that indicates whether an user clicks an ad (value 1) or not (value 0). The next 13 columns are numeric, and the last 26 are categorical columns containing obfuscated hashed values. The columns and their values are all anonymized to protect user privacy.\n",
"\n",
"\n",
"We will first download and preprocess the Criteo Terabyte dataset. Note that this will require about 1TB of disk storage.\n",
"\n",
"Notice: before downloading data, you must check out and agree with the terms and conditions of the Criteo Terabyte [dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "S2PR7weWmTcK"
},
"outputs": [],
"source": [
"! cd ../preproc && ./prepare_dataset.sh"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "EQAIszkxmTcT"
},
"source": [
"The original Facebook DLRM code base comes with a data preprocessing utility to preprocess the data. For continuous features, the data preprocessing steps include filling in missing values with 0 and normalization (shifting the values to be >=1 and taking natural logarithm). For categorical features, the preprocessing steps include building embedding tables and transforming hashed values into integer indicators. This code runs on a single CPU thread and takes ~6.5 days to transform the whole Criteo Terabyte data set. \n",
"\n",
"We improve the data preprocessing process with Spark on CPU to make use of all CPU threads. In the docker image, we have installed spark 2.4.5, which well start a standalone Spark cluster.This results in significant improvement in data pre-processing speed, scaling approximately linearly with the number of available CPU threads. This outputs the transformed data in parquet format. We finally convert the parquet data into the binary format similar to that designed by the Facebook team specially for the Criteo dataset. \n",
"\n",
"Our preprocessing scripts are designed for the Criteo Terabyte Dataset and should work with any other dataset with the same format. The data should be split into text files. Each line of those text files should contain a single training example. An example should consist of multiple fields separated by tabulators:\n",
"- The first field is the label `1` for a positive example and `0` for negative.\n",
"- The next `N` tokens should contain the numerical features separated by tabs.\n",
"- The next `M` tokens should contain the hashed categorical features separated by tabs.\n",
"\n",
"The outcomes of the data preprocessing steps are by default stored in `/data/dlrm/binary_dataset` containing 3 binary data files: `test_data.bin`, `train_data.bin` and `val_data.bin` and a JSON `file model_size.json` totalling ~650GB.\n",
"\n",
"Tips: by defaul the preprocessing script uses the first 23 days of the Criteo Terabyte dataset for training and the last day for validation. For a quick experiment, you can download and make use of a smaller number of days by modifying the `preproc/run_spark.sh` script."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "RL8d9IwzmTcV"
},
"source": [
"<a id=\"3\"></a>\n",
"## 3. Training"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "o6wayGf1mTcX"
},
"source": [
"The repository provides several training recipes on 1 GPU with FP32 and automatic mixed precisions."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HapDsY4VmTce"
},
"source": [
"#### Training with FP32\n",
"Training on 1 GPU with FP32 with the `--nofp16` option."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"%run ../dlrm/scripts/main \\\n",
"--mode train \\\n",
"--dataset /data/dlrm/binary_dataset \\\n",
"--nofp16 \\\n",
"--save_checkpoint_path ./dlrm_model_fp32.pt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On a V100 32GB, training takes approximately 2h56m for 1 epoch to an AUC of ~0.8. The final result should look similar to the below.\n",
"\n",
"```\n",
"Epoch:[0/1] [127600/128028] eta: 0:00:34 loss: 0.1226 step_time: 0.080038 lr: 1.1766\n",
"Epoch:[0/1] [127800/128028] eta: 0:00:18 loss: 0.1224 step_time: 0.080307 lr: 1.1480\n",
"Epoch:[0/1] [128000/128028] eta: 0:00:02 loss: 0.1221 step_time: 0.080562 lr: 1.1199\n",
"Test: [200/2721] loss: 0.1236 step_time: 0.0303\n",
"Test: [400/2721] loss: 0.1248 step_time: 0.0245\n",
"Test: [600/2721] loss: 0.1262 step_time: 0.0244\n",
"Test: [800/2721] loss: 0.1262 step_time: 0.0245\n",
"Test: [1000/2721] loss: 0.1293 step_time: 0.0245\n",
"Test: [1200/2721] loss: 0.1307 step_time: 0.0245\n",
"Test: [1400/2721] loss: 0.1281 step_time: 0.0245\n",
"Test: [1600/2721] loss: 0.1242 step_time: 0.0246\n",
"Test: [1800/2721] loss: 0.1230 step_time: 0.0245\n",
"Test: [2000/2721] loss: 0.1226 step_time: 0.0244\n",
"Test: [2200/2721] loss: 0.1239 step_time: 0.0246\n",
"Test: [2400/2721] loss: 0.1256 step_time: 0.0249\n",
"Test: [2600/2721] loss: 0.1247 step_time: 0.0248\n",
"Epoch 0 step 128027. Test loss 0.12557, auc 0.803517\n",
"Checkpoint saving took 42.90 [s]\n",
"DLL 2020-03-29 15:59:44.759627 - () best_auc : 0.80352 best_epoch : 1.00 average_train_throughput : 4.07e+05 average_test_throughput : 1.33e+06 \n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "j-aFEwb4mTcn"
},
"source": [
"#### Training with mixed-precision\n",
"Mixed precision training can be done with the `--fp16` option. Under the hood, the NVIDIA Pytorch extension library [Apex](https://github.com/NVIDIA/apex) to enable mixed precision training.\n",
"\n",
"Note: for subsequent launches of the %run magic, please restart your kernel manualy or execute the below cell to restart kernel."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note: for subsequent launches of the %run magic, \n",
"# please restart your kernel manualy or execute this cell to restart kernel.\n",
"import os\n",
"os._exit(00)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "o3AZ-CXYmTcp",
"scrolled": false
},
"outputs": [],
"source": [
"%run ../dlrm/scripts/main \\\n",
"--mode train \\\n",
"--dataset /data/dlrm/binary_dataset \\\n",
"--fp16 \\\n",
"--save_checkpoint_path ./dlrm_model_fp16.pt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On a V100 32GB, training takes approximately 1h41m for 1 epoch to an AUC of ~0.8. Thus, mixed precision training provides a speed up of ~ 1.7x.\n",
"\n",
"The final result should look similar to the below.\n",
"\n",
"```\n",
"...\n",
"Epoch:[0/1] [127800/128028] eta: 0:00:11 loss: 0.1224 step_time: 0.050719 lr: 1.1480\n",
"Epoch:[0/1] [128000/128028] eta: 0:00:01 loss: 0.1221 step_time: 0.050499 lr: 1.1199\n",
"Test: [200/2721] loss: 0.1236 step_time: 0.0271\n",
"Test: [400/2721] loss: 0.1247 step_time: 0.0278\n",
"Test: [600/2721] loss: 0.1262 step_time: 0.0275\n",
"Test: [800/2721] loss: 0.1262 step_time: 0.0278\n",
"Test: [1000/2721] loss: 0.1293 step_time: 0.0273\n",
"Test: [1200/2721] loss: 0.1306 step_time: 0.0264\n",
"Test: [1400/2721] loss: 0.1281 step_time: 0.0281\n",
"Test: [1600/2721] loss: 0.1242 step_time: 0.0273\n",
"Test: [1800/2721] loss: 0.1229 step_time: 0.0280\n",
"Test: [2000/2721] loss: 0.1226 step_time: 0.0274\n",
"Test: [2200/2721] loss: 0.1239 step_time: 0.0278\n",
"Test: [2400/2721] loss: 0.1256 step_time: 0.0289\n",
"Test: [2600/2721] loss: 0.1247 step_time: 0.0282\n",
"Epoch 0 step 128027. Test loss 0.12557, auc 0.803562\n",
"Checkpoint saving took 40.46 [s]\n",
"DLL 2020-03-28 15:15:36.290149 - () best_auc : 0.80356 best_epoch : 1.00 average_train_throughput : 6.47e+05 average_test_throughput : 1.17e+06\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "X959LYwjmTcw"
},
"source": [
"<a id=\"4\"></a>\n",
"## 4. Testing trained model\n",
"\n",
"After model training has completed, we can test the trained model against the Criteo test dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note: for subsequent launches of the %run magic, \n",
"# please restart your kernel manualy or execute this cell to restart kernel.\n",
"import os\n",
"os._exit(00)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"%run ../dlrm/scripts/main \\\n",
"--mode test\\\n",
"--dataset /data/dlrm/binary_dataset \\\n",
"--load_checkpoint_path ./dlrm_model_fp16.pt"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "g8MxXY5GmTc8"
},
"source": [
"# Conclusion\n",
"\n",
"In this notebook, we have walked through the complete process of preparing the container and data required for training the DLRM model. We have also investigated various training options with FP32 and automatic mixed precision, trained and tested DLRM models with new test data.\n",
"\n",
"## What's next\n",
"Now it's time to try the DLRM model on your own data. Observe the performance impact of mixed precision training while comparing the final accuracy of the models trained with FP32 and mixed precision.\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "249yGNLmmTc_"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"include_colab_link": true,
"name": "TensorFlow_UNet_Industrial_Colab_train_and_inference.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View file

@ -0,0 +1,69 @@
<!-- #region -->
# DLRM Jupyter demo notebooks
This folder contains the demo notebooks for DLRM. The most convenient way to use these notebooks is from within a docker container, which provides a self-contained, isolated and reproducible environment for all experiments. Refer to the [Quick Start Guide section](../README.md) of the README for a comprehensive guide.
First, clone the repository:
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Recommendation/DLRM
```
## Notebook list
### 1. Pytorch_DLRM_pyt_train_and_inference.ipynb: training and inference demo
To execute this notebook, first build the DLRM container:
```
docker build . -t nvidia_dlrm_pyt
```
Make a directory for storing DLRM data and start a docker container with:
```
mkdir -p data
docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_dlrm_pyt bash
```
Within the docker interactive bash session, start Jupyter with
```
export PYTHONPATH=/workspace/dlrm
jupyter notebook --ip 0.0.0.0 --port 8888
```
Then open the Jupyter GUI interface on your host machine at http://localhost:8888. Within the container, this demo notebook is located at `/workspace/dlrm/notebooks`.
<!-- #endregion -->
### 2. DLRM_Triton_inference_demo.ipynb: inference demo with the NVIDIA Triton Inference server.
To execute this notebook, first build the following inference container:
```
docker build -t dlrm-inference . -f triton/Dockerfile
```
Start an interactive docker session with:
```
docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_SAVED_MODEL>:/models -v <PATH_TO_EXPORT_MODEL>:/repository dlrm-inference bash
```
where:
- PATH_TO_SAVED_MODEL: directory containing the trained DLRM models.
- PATH_TO_EXPORT_MODEL: directory which will contain the converted model to be used with the NVIDIA Triton inference server.
Within the docker interactive bash session, start Jupyter with
```
export PYTHONPATH=/workspace/dlrm
jupyter notebook --ip 0.0.0.0 --port 8888
```
Then open the Jupyter GUI interface on your host machine at http://localhost:8888. Within the container, this demo notebook is located at `/workspace/dlrm/notebooks`.

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

View file

@ -0,0 +1,90 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed
import glob
import argparse
import tqdm
import subprocess
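# Convert a single parquet shard into raw binary records: enforce the column order
# _c0 .. _c39, cast the dense features to float32 and write the packed rows to a .bin file.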
def process_file(f, dst):
all_columns_sorted = [f'_c{i}' for i in range(0, 40)]
data = pd.read_parquet(f)
data = data[all_columns_sorted]
dense_columns = [f'_c{i}' for i in range(1, 14)]
data[dense_columns] = data[dense_columns].astype(np.float32)
data = data.to_records(index=False)
data = data.tobytes()
dst_file = dst + '/' + f.split('/')[-1] + '.bin'
with open(dst_file, 'wb') as dst_fd:
dst_fd.write(data)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--src_dir', type=str)
parser.add_argument('--intermediate_dir', type=str)
parser.add_argument('--dst_dir', type=str)
parser.add_argument('--parallel_jobs', default=40, type=int)
args = parser.parse_args()
print('Processing train files...')
train_src_files = glob.glob(args.src_dir + '/train/*.parquet')
train_intermediate_dir = args.intermediate_dir + '/train'
os.makedirs(train_intermediate_dir, exist_ok=True)
Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, train_intermediate_dir) for f in tqdm.tqdm(train_src_files))
print('Train files conversion done')
print('Processing test files...')
test_src_files = glob.glob(args.src_dir + '/test/*.parquet')
test_intermediate_dir = args.intermediate_dir + '/test'
os.makedirs(test_intermediate_dir, exist_ok=True)
Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, test_intermediate_dir) for f in tqdm.tqdm(test_src_files))
print('Test files conversion done')
print('Processing validation files...')
valid_src_files = glob.glob(args.src_dir + '/validation/*.parquet')
valid_intermediate_dir = args.intermediate_dir + '/valid'
os.makedirs(valid_intermediate_dir, exist_ok=True)
Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, valid_intermediate_dir) for f in tqdm.tqdm(valid_src_files))
print('Validation files conversion done')
os.makedirs(args.dst_dir, exist_ok=True)
print('Concatenating train files')
os.system(f'cat {train_intermediate_dir}/*.bin > {args.dst_dir}/train_data.bin')
print('Concatenating test files')
os.system(f'cat {test_intermediate_dir}/*.bin > {args.dst_dir}/test_data.bin')
print('Concatenating validation files')
os.system(f'cat {valid_intermediate_dir}/*.bin > {args.dst_dir}/val_data.bin')
print('Done')
if __name__ == '__main__':
main()

View file

@ -0,0 +1,59 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#! /bin/bash
set -e
set -x
ls -ltrash
download_dir=${download_dir:-'/data/dlrm/criteo'}
./verify_criteo_downloaded.sh ${download_dir}
spark_output_path=${spark_output_path:-'/data/dlrm/spark/output'}
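# Spark writes a _SUCCESS marker into each output folder on completion,
# so its presence means the corresponding split has already been preprocessed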
if [ -f ${spark_output_path}/train/_SUCCESS ] \
&& [ -f ${spark_output_path}/validation/_SUCCESS ] \
&& [ -f ${spark_output_path}/test/_SUCCESS ]; then
echo "Spark preprocessing already carried done"
else
echo "Performing spark preprocessing"
./run_spark.sh ${download_dir} ${spark_output_path}
fi
conversion_intermediate_dir=${conversion_intermediate_dir:-'/data/dlrm/intermediate_binary'}
final_output_dir=${final_output_dir:-'/data/dlrm/binary_dataset'}
if [ -f ${final_output_dir}/train_data.bin ] \
&& [ -f ${final_output_dir}/val_data.bin ] \
&& [ -f ${final_output_dir}/test_data.bin ] \
&& [ -f ${final_output_dir}/model_sizes.json ]; then
echo "Final conversion already done"
else
echo "Performing final conversion to a custom data format"
python parquet_to_binary.py --parallel_jobs 40 --src_dir ${spark_output_path} \
--intermediate_dir ${conversion_intermediate_dir} \
--dst_dir ${final_output_dir}
cp "${spark_output_path}/model_size.json" "${final_output_dir}/model_size.json"
fi
echo "Done preprocessing the Criteo Kaggle Dataset"
echo "You can now start the training with: "
echo "python -m dlrm.scripts.main --mode train --dataset /data/dlrm/binary_dataset/ --model_config dlrm/config/default.json"

View file

@ -0,0 +1,166 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#########################################################################
# File Name: run_spark.sh
#!/bin/bash
set -e
# the environment variables to run spark job
# should modify below environment variables
# the data path including 1TB criteo data, day_0, day_1, ...
export INPUT_PATH=${1:-'/data/dlrm/criteo'}
# the output path, use for generating the dictionary and the final dataset
# the output folder should have more than 300GB
export OUTPUT_PATH=${2:-'/data/dlrm/spark/output'}
# spark local dir should have about 3TB
# the temporary path used for spark shuffle write
export SPARK_LOCAL_DIRS='/data/dlrm/spark/tmp'
# the numbers below should be adjusted according to the resources of your running environment
# set the total number of CPU cores, spark can use
export TOTAL_CORES=80
# set the number of executors
export NUM_EXECUTORS=8
# the number of cores for each executor; it is calculated automatically below
export NUM_EXECUTOR_CORES=$((${TOTAL_CORES}/${NUM_EXECUTORS}))
# unit: GB, set the max memory you want to use
export TOTAL_MEMORY=800
# unit: GB, set the memory for driver
export DRIVER_MEMORY=32
# the memory per executor
export EXECUTOR_MEMORY=$(((${TOTAL_MEMORY}-${DRIVER_MEMORY})/${NUM_EXECUTORS}))
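# example: with the defaults above, each of the 8 executors gets 80/8 = 10 cores
# and (800 - 32)/8 = 96 GB of memory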
# use frequency_limit=15 or not
# by default use a frequency limit of 15
USE_FREQUENCY_LIMIT=1
OPTS=""
if [[ $USE_FREQUENCY_LIMIT == 1 ]]; then
OPTS="--frequency_limit 15"
fi
export SPARK_HOME=/opt/spark-2.4.5-bin-hadoop2.7
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
# we use spark standalone to run the job
export MASTER=spark://$HOSTNAME:7077
echo "Starting spark standalone"
start-master.sh
start-slave.sh $MASTER
echo "Generating the dictionary..."
spark-submit --master $MASTER \
--driver-memory "${DRIVER_MEMORY}G" \
--executor-cores $NUM_EXECUTOR_CORES \
--executor-memory "${EXECUTOR_MEMORY}G" \
--conf spark.cores.max=$TOTAL_CORES \
--conf spark.task.cpus=1 \
--conf spark.sql.files.maxPartitionBytes=1073741824 \
--conf spark.sql.shuffle.partitions=600 \
--conf spark.driver.maxResultSize=2G \
--conf spark.locality.wait=0s \
--conf spark.network.timeout=1800s \
spark_data_utils.py --mode generate_models \
$OPTS \
--input_folder $INPUT_PATH \
--days 0-23 \
--model_folder $OUTPUT_PATH/models \
--write_mode overwrite --low_mem 2>&1 | tee submit_dict_log.txt
echo "Transforming the train data from day_0 to day_22..."
spark-submit --master $MASTER \
--driver-memory "${DRIVER_MEMORY}G" \
--executor-cores $NUM_EXECUTOR_CORES \
--executor-memory "${EXECUTOR_MEMORY}G" \
--conf spark.cores.max=$TOTAL_CORES \
--conf spark.task.cpus=1 \
--conf spark.sql.files.maxPartitionBytes=1073741824 \
--conf spark.sql.shuffle.partitions=600 \
--conf spark.driver.maxResultSize=2G \
--conf spark.locality.wait=0s \
--conf spark.network.timeout=1800s \
spark_data_utils.py --mode transform \
--input_folder $INPUT_PATH \
--days 0-22 \
--output_folder $OUTPUT_PATH/train \
--model_size_file $OUTPUT_PATH/model_size.json \
--model_folder $OUTPUT_PATH/models \
--write_mode overwrite --low_mem 2>&1 | tee submit_train_log.txt
echo "Splitting the last day into 2 parts of test and validation..."
last_day=$INPUT_PATH/day_23
temp_test=$OUTPUT_PATH/temp/test
temp_validation=$OUTPUT_PATH/temp/validation
mkdir -p $temp_test $temp_validation
lines=`wc -l $last_day | awk '{print $1}'`
former=$((lines / 2))
latter=$((lines - former))
head -n $former $last_day > $temp_test/day_23
tail -n $latter $last_day > $temp_validation/day_23
echo "Transforming the test data in day_23..."
spark-submit --master $MASTER \
--driver-memory "${DRIVER_MEMORY}G" \
--executor-cores $NUM_EXECUTOR_CORES \
--executor-memory "${EXECUTOR_MEMORY}G" \
--conf spark.cores.max=$TOTAL_CORES \
--conf spark.task.cpus=1 \
--conf spark.sql.files.maxPartitionBytes=1073741824 \
--conf spark.sql.shuffle.partitions=30 \
--conf spark.driver.maxResultSize=2G \
--conf spark.locality.wait=0s \
--conf spark.network.timeout=1800s \
spark_data_utils.py --mode transform \
--input_folder $temp_test \
--days 23-23 \
--output_folder $OUTPUT_PATH/test \
--output_ordering input \
--model_folder $OUTPUT_PATH/models \
--write_mode overwrite --low_mem 2>&1 | tee submit_test_log.txt
echo "Transforming the validation data in day_23..."
spark-submit --master $MASTER \
--driver-memory "${DRIVER_MEMORY}G" \
--executor-cores $NUM_EXECUTOR_CORES \
--executor-memory "${EXECUTOR_MEMORY}G" \
--conf spark.cores.max=$TOTAL_CORES \
--conf spark.task.cpus=1 \
--conf spark.sql.files.maxPartitionBytes=1073741824 \
--conf spark.sql.shuffle.partitions=30 \
--conf spark.driver.maxResultSize=2G \
--conf spark.locality.wait=0s \
--conf spark.network.timeout=1800s \
spark_data_utils.py --mode transform \
--input_folder $temp_validation \
--days 23-23 \
--output_folder $OUTPUT_PATH/validation \
--output_ordering input \
--model_folder $OUTPUT_PATH/models \
--write_mode overwrite --low_mem 2>&1 | tee submit_validation_log.txt
rm -r $temp_test $temp_validation

View file

@ -0,0 +1,507 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
from argparse import ArgumentParser
from collections import OrderedDict
from contextlib import contextmanager
from operator import itemgetter
from time import time
from pyspark import broadcast
from pyspark.sql import Row, SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *
LABEL_COL = 0
INT_COLS = list(range(1, 14))
CAT_COLS = list(range(14, 40))
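# Count how often every value occurs in each categorical column, optionally
# dropping values that appear fewer times than the requested frequency limit.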
def get_column_counts_with_frequency_limit(df, frequency_limit = None):
cols = ['_c%d' % i for i in CAT_COLS]
df = (df
.select(posexplode(array(*cols)))
.withColumnRenamed('pos', 'column_id')
.withColumnRenamed('col', 'data')
.filter('data is not null')
.groupBy('column_id', 'data')
.count())
if frequency_limit:
frequency_limit = frequency_limit.split(",")
exclude = []
default_limit = None
for fl in frequency_limit:
frequency_pair = fl.split(":")
if len(frequency_pair) == 1:
default_limit = int(frequency_pair[0])
elif len(frequency_pair) == 2:
df = df.filter((col('column_id') != int(frequency_pair[0]) - CAT_COLS[0]) | (col('count') >= int(frequency_pair[1])))
exclude.append(int(frequency_pair[0]))
if default_limit:
remain = [x - CAT_COLS[0] for x in CAT_COLS if x not in exclude]
df = df.filter((~col('column_id').isin(remain)) | (col('count') >= default_limit))
# for comparing isin and separate filter
# for i in remain:
# df = df.filter((col('column_id') != i - CAT_COLS[0]) | (col('count') >= default_limit))
return df
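# Rank the values of each categorical column by their counts; the most frequent value gets id 1.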
def assign_id_with_window(df):
windowed = Window.partitionBy('column_id').orderBy(desc('count'))
return (df
.withColumn('id', row_number().over(windowed))
.withColumnRenamed('count', 'model_count'))
def assign_low_mem_partial_ids(df):
# To avoid some scaling issues with a simple window operation, we use a more complex method
# to compute the same thing, but in a more distributed spark specific way
df = df.orderBy(asc('column_id'), desc('count'))
# The monotonically_increasing_id is the partition id in the top 31 bits and the rest
# is an increasing count of the rows within that partition. So we split it into two parts,
# the partition id part_id and the count mono_id
df = df.withColumn('part_id', spark_partition_id())
return df.withColumn('mono_id', monotonically_increasing_id() - shiftLeft(col('part_id'), 33))
def assign_low_mem_final_ids(df):
# Now we can find the minimum and maximum mono_ids within a given column/partition pair
sub_model = df.groupBy('column_id', 'part_id').agg(max('mono_id').alias('top'), min('mono_id').alias('bottom'))
sub_model = sub_model.withColumn('diff', col('top') - col('bottom') + 1)
sub_model = sub_model.drop('top')
# This window function is over aggregated column/partition pair table. It will do a running sum of the rows
# within that column
windowed = Window.partitionBy('column_id').orderBy('part_id').rowsBetween(Window.unboundedPreceding, -1)
sub_model = sub_model.withColumn('running_sum', sum('diff').over(windowed)).na.fill(0, ["running_sum"])
joined = df.withColumnRenamed('column_id', 'i_column_id')
joined = joined.withColumnRenamed('part_id', 'i_part_id')
joined = joined.withColumnRenamed('count', 'model_count')
# Then we can join the original input with the pair it is a part of
joined = joined.join(sub_model, (col('i_column_id') == col('column_id')) & (col('part_id') == col('i_part_id')))
# So with all that we can subtract bottom from mono_id, making it start at 0 for each partition
# and then add in the running_sum so the id is contiguous and unique for the entire column. + 1 to make it match the 1 based indexing
# for row_number
ret = joined.select(col('column_id'),
col('data'),
(col('mono_id') - col('bottom') + col('running_sum') + 1).cast(IntegerType()).alias('id'),
col('model_count'))
return ret
def get_column_models(combined_model):
for i in CAT_COLS:
model = (combined_model
.filter('column_id == %d' % (i - CAT_COLS[0]))
.drop('column_id'))
yield i, model
def col_of_rand_long():
return (rand() * (1 << 52)).cast(LongType())
def skewed_join(df, model, col_name, cutoff):
# Most versions of spark don't have a good way
# to deal with a skewed join out of the box.
# Some do and if you want to replace this with
# one of those that would be great.
# Because we have statistics about the skewness
# that we can use, we divide the model up into two parts:
# one part is the highly skewed part and we do a
# broadcast join for that part, but keep the result in
# a separate column
b_model = broadcast(model.filter(col('model_count') >= cutoff)
.withColumnRenamed('data', col_name)
.drop('model_count'))
df = (df
.join(b_model, col_name, how='left')
.withColumnRenamed('id', 'id_tmp'))
# We also need to spread the skewed data that matched
# evenly. We will use a source of randomness for this
# but use a -1 for anything that still needs to be matched
if 'ordinal' in df.columns:
rand_column = col('ordinal')
else:
rand_column = col_of_rand_long()
df = df.withColumn('join_rand',
# null values are not in the model, they are filtered out
# but can be a source of skewness so include them in
# the even distribution
when(col('id_tmp').isNotNull() | col(col_name).isNull(), rand_column)
.otherwise(lit(-1)))
# Null out the string data that already matched to save memory
df = df.withColumn(col_name,
when(col('id_tmp').isNotNull(), None)
.otherwise(col(col_name)))
# Now do the second join, which will be a non broadcast join.
# Sadly spark is too smart for its own good and will optimize out
# joining on a column it knows will always be a constant value.
# So we have to make a convoluted version of assigning a -1 to the
# randomness column for the model itself to work around that.
nb_model = (model
.withColumn('join_rand', when(col('model_count') < cutoff, lit(-1)).otherwise(lit(-2)))
.filter(col('model_count') < cutoff)
.withColumnRenamed('data', col_name)
.drop('model_count'))
df = (df
.join(nb_model, ['join_rand', col_name], how='left')
.drop(col_name, 'join_rand')
# Pick either join result as an answer
.withColumn(col_name, coalesce(col('id'), col('id_tmp')))
.drop('id', 'id_tmp'))
return df
def apply_models(df, models, broadcast_model = False, skew_broadcast_pct = 1.0):
# sort the models so broadcast joins come first. This is
# so we reduce the amount of shuffle data sooner than later
# If we parsed the string hex values to ints early on this would
# not make a difference.
models = sorted(models, key=itemgetter(3), reverse=True)
for i, model, original_rows, would_broadcast in models:
col_name = '_c%d' % i
if not (would_broadcast or broadcast_model):
# The data is highly skewed so we need to offset that
cutoff = int(original_rows * skew_broadcast_pct/100.0)
df = skewed_join(df, model, col_name, cutoff)
else:
# broadcast joins can handle skewed data so no need to
# do anything special
model = (model.drop('model_count')
.withColumnRenamed('data', col_name))
model = broadcast(model) if broadcast_model else model
df = (df
.join(model, col_name, how='left')
.drop(col_name)
.withColumnRenamed('id', col_name))
return df.fillna(0, ['_c%d' % i for i in CAT_COLS])
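# Optionally replace each numeric feature x with log(x + 3), then fill missing values with 0.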
def transform_log(df, transform_log = False):
cols = ['_c%d' % i for i in INT_COLS]
if transform_log:
for col_name in cols:
df = df.withColumn(col_name, log(df[col_name] + 3))
return df.fillna(0, cols)
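# Estimate the total on-disk size of the files under str_path and check whether
# Spark would consider the table small enough for a broadcast join.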
def would_broadcast(spark, str_path):
sc = spark.sparkContext
config = sc._jsc.hadoopConfiguration()
path = sc._jvm.org.apache.hadoop.fs.Path(str_path)
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
stat = fs.listFiles(path, True)
sum = 0
while stat.hasNext():
sum = sum + stat.next().getLen()
sql_conf = sc._jvm.org.apache.spark.sql.internal.SQLConf()
cutoff = sql_conf.autoBroadcastJoinThreshold() * sql_conf.fileCompressionFactor()
return sum <= cutoff
def delete_data_source(spark, path):
sc = spark.sparkContext
config = sc._jsc.hadoopConfiguration()
path = sc._jvm.org.apache.hadoop.fs.Path(path)
sc._jvm.org.apache.hadoop.fs.FileSystem.get(config).delete(path, True)
def load_raw(spark, folder, day_range):
label_fields = [StructField('_c%d' % LABEL_COL, IntegerType())]
int_fields = [StructField('_c%d' % i, IntegerType()) for i in INT_COLS]
str_fields = [StructField('_c%d' % i, StringType()) for i in CAT_COLS]
schema = StructType(label_fields + int_fields + str_fields)
paths = [os.path.join(folder, 'day_%d' % i) for i in day_range]
return (spark
.read
.schema(schema)
.option('sep', '\t')
.csv(paths))
def rand_ordinal(df):
# create a random long from the double precision float.
# The fraction part of a double is 52 bits, so we try to capture as much
# of that as possible
return df.withColumn('ordinal', col_of_rand_long())
def day_from_ordinal(df, num_days):
return df.withColumn('day', (col('ordinal') % num_days).cast(IntegerType()))
def day_from_input_file(df):
return df.withColumn('day', substring_index(input_file_name(), '_', -1).cast(IntegerType()))
def psudo_sort_by_day_plus(spark, df, num_days):
# Sort is very expensive because it needs to calculate the partitions
# which in our case may involve rereading all of the data. In some cases
# we can avoid this by repartitioning the data and sorting within a single partition
shuffle_parts = int(spark.conf.get('spark.sql.shuffle.partitions'))
extra_parts = int(shuffle_parts/num_days)
if extra_parts <= 0:
df = df.repartition('day')
else:
#We want to spread out the computation to about the same amount as shuffle_parts
divided = (col('ordinal') / num_days).cast(LongType())
extra_ident = divided % extra_parts
df = df.repartition(col('day'), extra_ident)
return df.sortWithinPartitions('day', 'ordinal')
def load_combined_model(spark, model_folder):
path = os.path.join(model_folder, 'combined.parquet')
return spark.read.parquet(path)
def save_combined_model(df, model_folder, mode=None):
path = os.path.join(model_folder, 'combined.parquet')
df.write.parquet(path, mode=mode)
def delete_combined_model(spark, model_folder):
path = os.path.join(model_folder, 'combined.parquet')
delete_data_source(spark, path)
def load_low_mem_partial_ids(spark, model_folder):
path = os.path.join(model_folder, 'partial_ids.parquet')
return spark.read.parquet(path)
def save_low_mem_partial_ids(df, model_folder, mode=None):
path = os.path.join(model_folder, 'partial_ids.parquet')
df.write.parquet(path, mode=mode)
def delete_low_mem_partial_ids(spark, model_folder):
path = os.path.join(model_folder, 'partial_ids.parquet')
delete_data_source(spark, path)
def load_column_models(spark, model_folder, count_required):
for i in CAT_COLS:
path = os.path.join(model_folder, '%d.parquet' % i)
df = spark.read.parquet(path)
if count_required:
values = df.agg(sum('model_count').alias('sum'), count('*').alias('size')).collect()
else:
values = df.agg(sum('model_count').alias('sum')).collect()
yield i, df, values[0], would_broadcast(spark, path)
def save_column_models(column_models, model_folder, mode=None):
for i, model in column_models:
path = os.path.join(model_folder, '%d.parquet' % i)
model.write.parquet(path, mode=mode)
def save_model_size(model_size, path, write_mode):
if os.path.exists(path) and write_mode == 'errorifexists':
print('Error: model size file %s exists' % path)
sys.exit(1)
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
with open(path, 'w') as fp:
json.dump(model_size, fp, indent=4)
_benchmark = {}
@contextmanager
def _timed(step):
start = time()
yield
end = time()
_benchmark[step] = end - start
def _parse_args():
parser = ArgumentParser()
parser.add_argument(
'--mode',
required=True,
choices=['generate_models', 'transform'])
parser.add_argument('--days', required=True)
parser.add_argument('--input_folder', required=True)
parser.add_argument('--output_folder')
parser.add_argument('--model_size_file')
parser.add_argument('--model_folder', required=True)
parser.add_argument(
'--write_mode',
choices=['overwrite', 'errorifexists'],
default='errorifexists')
parser.add_argument('--frequency_limit')
parser.add_argument('--no_numeric_log_col', action='store_true')
#Support for running in a lower memory environment
parser.add_argument('--low_mem', action='store_true')
parser.add_argument(
'--output_ordering',
choices=['total_random', 'day_random', 'any', 'input'],
default='total_random')
parser.add_argument(
'--output_partitioning',
choices=['day', 'none'],
default='none')
parser.add_argument('--dict_build_shuffle_parallel_per_day', type=int, default=2)
parser.add_argument('--apply_shuffle_parallel_per_day', type=int, default=25)
parser.add_argument('--skew_broadcast_pct', type=float, default=1.0)
parser.add_argument('--debug_mode', action='store_true')
args = parser.parse_args()
start, end = args.days.split('-')
args.day_range = list(range(int(start), int(end) + 1))
args.days = len(args.day_range)
return args
def _main():
args = _parse_args()
spark = SparkSession.builder.getOrCreate()
df = load_raw(spark, args.input_folder, args.day_range)
if args.mode == 'generate_models':
spark.conf.set('spark.sql.shuffle.partitions', args.days * args.dict_build_shuffle_parallel_per_day)
with _timed('generate models'):
col_counts = get_column_counts_with_frequency_limit(df, args.frequency_limit)
if args.low_mem:
# in low memory mode we have to save an intermediate result
# because if we try to do it in one query spark ends up assigning the
# partial ids in two different locations that are not guaranteed to line up.
# Saving the intermediate result prevents that from happening by assigning the partial ids
# and then writing them out.
save_low_mem_partial_ids(
assign_low_mem_partial_ids(col_counts),
args.model_folder,
args.write_mode)
save_combined_model(
assign_low_mem_final_ids(load_low_mem_partial_ids(spark, args.model_folder)),
args.model_folder,
args.write_mode)
if not args.debug_mode:
delete_low_mem_partial_ids(spark, args.model_folder)
else:
save_combined_model(
assign_id_with_window(col_counts),
args.model_folder,
args.write_mode)
save_column_models(
get_column_models(load_combined_model(spark, args.model_folder)),
args.model_folder,
args.write_mode)
if not args.debug_mode:
delete_combined_model(spark, args.model_folder)
if args.mode == 'transform':
spark.conf.set('spark.sql.shuffle.partitions', args.days * args.apply_shuffle_parallel_per_day)
with _timed('transform'):
if args.output_ordering == 'total_random':
df = rand_ordinal(df)
if args.output_partitioning == 'day':
df = day_from_ordinal(df, args.days)
elif args.output_ordering == 'day_random':
df = rand_ordinal(df)
df = day_from_input_file(df)
elif args.output_ordering == 'input':
df = df.withColumn('ordinal', monotonically_increasing_id())
if args.output_partitioning == 'day':
df = day_from_input_file(df)
else: # any ordering
if args.output_partitioning == 'day':
df = day_from_input_file(df)
models = list(load_column_models(spark, args.model_folder, bool(args.model_size_file)))
if args.model_size_file:
save_model_size(
OrderedDict(('_c%d' % i, agg.size) for i, _, agg, _ in models),
args.model_size_file,
args.write_mode)
models = [(i, df, agg.sum, flag) for i, df, agg, flag in models]
df = apply_models(
df,
models,
not args.low_mem,
args.skew_broadcast_pct)
df = transform_log(df, not args.no_numeric_log_col)
if args.output_partitioning == 'day':
partitionBy = 'day'
else:
partitionBy = None
if args.output_ordering == 'total_random':
if args.output_partitioning == 'day':
df = psudo_sort_by_day_plus(spark, df, args.days)
else: # none
# Don't do a full sort it is expensive. Order is random so
# just make it random
df = df.repartition('ordinal').sortWithinPartitions('ordinal')
df = df.drop('ordinal')
elif args.output_ordering == 'day_random':
df = psudo_sort_by_day_plus(spark, df, args.days)
df = df.drop('ordinal')
if args.output_partitioning != 'day':
df = df.drop('day')
elif args.output_ordering == 'input':
if args.low_mem:
# This is the slowest option. We totally messed up the order so we have to put
# it back in the correct order
df = df.orderBy('ordinal')
else:
# Applying the dictionary happened within a single task so we are already really
# close to the correct order, just need to sort within the partition
df = df.sortWithinPartitions('ordinal')
df = df.drop('ordinal')
if args.output_partitioning != 'day':
df = df.drop('day')
# else: any ordering so do nothing the ordering does not matter
df.write.parquet(
args.output_folder,
mode=args.write_mode,
partitionBy=partitionBy)
print('=' * 100)
print(_benchmark)
if __name__ == '__main__':
_main()

View file

@ -0,0 +1,34 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#! /bin/bash
set -e
set -x
download_dir=${1:-'/data/dlrm/criteo'}
cd ${download_dir}
for i in $(seq 0 23); do
filename=day_${i}
if [ -f $filename ]; then
echo "$filename exists, OK"
else
echo "$filename does not exist. Please follow the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/ to download it"
exit 1
fi
done
cd -
echo "Criteo data verified"

View file

@ -0,0 +1,4 @@
-e git://github.com/NVIDIA/dllogger#egg=dllogger
absl-py>=0.7.0
numpy
pyarrow

View file

@ -0,0 +1,31 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
abspath = os.path.dirname(os.path.realpath(__file__))
print(find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]))
setup(name="dlrm",
package_dir={'dlrm': 'dlrm'},
version="1.0.0",
description="Reimplementation of Facebook's DLRM",
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
zip_safe=False,
cmdclass={"build_ext": BuildExtension})

View file

@ -0,0 +1,31 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk as trt
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install onnxruntime
COPY --from=trt /workspace/install /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
RUN ls /workspace/install/python
RUN pip install /workspace/install/python/tensorrtserver-1.12.0-py3-none-linux_x86_64.whl
ENV PYTHONPATH /workspace/dlrm
WORKDIR /workspace/dlrm
COPY . .

View file

@ -0,0 +1,278 @@
# Deploying the DLRM model using Triton Inference Server
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/trtis-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
This folder contains deployment instructions and an example client application for running inference on the
Triton Inference Server, as well as a detailed performance analysis.
## Table Of Contents
- [Running Triton Inference Server and client](#running-triton-inference-server-and-client)
- [Latency vs Throughput](#throughputlatency-results)
- [Dynamic batching support](#dynamic-batching-support)
## Running Triton Inference Server and client
The very first step of deployment is to acquire a trained checkpoint and the model configuration for this
checkpoint. Default model configurations are stored inside the `dlrm/config` directory.
### Inference container
Every command below is called from the special inference container. To build that container, go to the main
repository folder and call
`docker build -t dlrm-inference . -f triton/Dockerfile`
This command will download the dependencies and build the inference container. Then run a shell inside the
container:
`docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_MODEL_REPOSITORY>:/repository dlrm-inference bash`
Here `--gpus device=0` exposes only GPU 0 to the container; `device=0,1,2,3` would select the GPUs indexed by ordinals `0,1,2` and `3`, and `device=all` exposes all available GPUs. `PATH_TO_MODEL_REPOSITORY` is the location where
the deployed models are stored.
### Deploying the model
To deploy the model into a Triton-compatible format, the `deployer.py` script can be used. This script is
meant to be run from inside the deployment docker container.
```
usage: deployer.py [-h] (--ts-script | --ts-trace | --onnx) [--triton-no-cuda]
[--triton-model-name TRITON_MODEL_NAME]
[--triton-model-version TRITON_MODEL_VERSION]
[--triton-max-batch-size TRITON_MAX_BATCH_SIZE]
[--triton-dyn-batching-delay TRITON_DYN_BATCHING_DELAY]
[--triton-engine-count TRITON_ENGINE_COUNT]
[--save-dir SAVE_DIR]
...
optional arguments:
-h, --help show this help message and exit
--ts-script convert to torchscript using torch.jit.script
--ts-trace convert to torchscript using torch.jit.trace
--onnx convert to onnx using torch.onnx.export
triton related flags:
--triton-no-cuda Use the CPU for tracing.
--triton-model-name TRITON_MODEL_NAME
exports to appropriate directory structure for triton
--triton-model-version TRITON_MODEL_VERSION
exports to appropriate directory structure for triton
--triton-max-batch-size TRITON_MAX_BATCH_SIZE
Specifies the 'max_batch_size' in the triton model
config. See the triton documentation for more info.
--triton-dyn-batching-delay TRITON_DYN_BATCHING_DELAY
Determines the dynamic_batching queue delay in
milliseconds(ms) for the triton model config. Use '0'
or '-1' to specify static batching. See the triton
documentation for more info.
--triton-engine-count TRITON_ENGINE_COUNT
Specifies the 'instance_group' count value in the
triton model config. See the triton documentation for
more info.
--save-dir SAVE_DIR Saved model directory
other flags:
model_arguments arguments that will be ignored by deployer lib and
will be forwarded to your deployer script
```
The following model-specific arguments have to be specified for model deployment:
```
--num_numerical_features NUM_FEATURES
Number of numerical features at network input.
--embedding_dim EMBEDDING_DIM
Embedding dimensionality.
--top_mlp_sizes TOP_MLP_SIZES [TOP_MLP_SIZES ...]
Units in layers of top MLP (default: 1024 1024 512 256 1).
--bottom_mlp_sizes BOTTOM_MLP_SIZES [BOTTOM_MLP_SIZES ...]
Units in layers of bottom MLP (default: 512 256 128).
--interaction_op {cat,dot}
Interaction operator to use.
--self_interaction
Enables self interaction.
--hash_indices
Hash indices for categorical features.
--dataset DATASET
Path to dataset directory contaning model_size.json file
describing input sizes for each embedding layer.
--batch_size BATCH_SIZE
Internal dataloader batch size, usually it is the same as batch size
specified in --triton-max-batch_size flag.
--fp16
Set a model for fp16 deployment.
--dump_perf_data DIRECTORY_NAME
Dump binary performance data that can by loaded by perf client.
--model_checkpoint MODEL_CHECKPOINT
Checkpoint file with trained model that is going to be deployed.
--cpu Export cpu model instead of gpu.
```
For example, to deploy the model in ONNX format, using half precision and a max batch size of 4096, under the name
`dlrm-onnx-16`, execute:
`python triton/deployer.py --onnx --triton-model-name dlrm-onnx-16 --triton-max-batch-size 4096 --save-dir /repository -- --model_checkpoint /results/checkpoint --fp16 --batch_size 4096 --num_numerical_features 13 --embedding_dim 128 --top_mlp_sizes 1024 1024 512 256 1 --bottom_mlp_sizes 512 256 128 --interaction_op dot --hash_indices --dataset /data`
Here `model_checkpoint` is a checkpoint for a trained model with the same configuration as used during export, and the dataset (or at least the dataset configuration)
is mounted under `/data`.
### Running the Triton server
**NOTE: This step is executed outside inference container**
1. `docker pull nvcr.io/nvidia/tritonserver:20.03-py3`
2. `docker run -d --rm --gpus device=0 --ipc=host --network=host [--cpuset-cpus=0-15] -p 8000:8000 -p 8001:8001 -p 8002:8002 -v <PATH_TO_MODEL_REPOSITORY>:/models nvcr.io/nvidia/tritonserver:20.03-py3 trtserver --model-store=/models --log-verbose=1 --model-control-mode=explicit`
Here `--gpus device=0` exposes only GPU 0 to the server; `device=0,1,2,3` would select the GPUs indexed by ordinals `0,1,2` and `3`, and `device=all` exposes all available GPUs. `PATH_TO_MODEL_REPOSITORY` is the location where
the deployed models are stored. The additional `--model-control-mode=explicit` option allows you to manually load and
unload models. This is especially useful when dealing with numerous large models like DLRM.
For models exported to the ONNX format and hosted inside the ONNX runtime, it might be required to limit the visible CPUs to fully utilize GPU acceleration. Use the `--cpuset-cpus` docker option for that.
### Running client
The example client `client.py` allows you to check model performance against synthetic or real validation
data. The client connects to the Triton server and performs inference.
```
usage: client.py [-h] --triton-server-url TRITON_SERVER_URL
--triton-model-name TRITON_MODEL_NAME
[--triton-model-version TRITON_MODEL_VERSION]
[--protocol PROTOCOL] [-v] [-H HTTP_HEADER]
[--num_numerical_features NUM_NUMERICAL_FEATURES]
--dataset_config DATASET_CONFIG
[--inference_data INFERENCE_DATA] [--batch_size BATCH_SIZE]
[--fp16]
optional arguments:
-h, --help show this help message and exit
--triton-server-url TRITON_SERVER_URL
URL address of triton server (with port)
--triton-model-name TRITON_MODEL_NAME
Triton deployed model name
--triton-model-version TRITON_MODEL_VERSION
Triton model version
--protocol PROTOCOL Communication protocol (HTTP/GRPC)
-v, --verbose Verbose mode.
-H HTTP_HEADER HTTP headers to add to inference server requests.
Format is -H"Header:Value".
--num_numerical_features NUM_NUMERICAL_FEATURES
Number of numerical features as an input.
--dataset_config DATASET_CONFIG
Configuration file describing categorical features
--inference_data INFERENCE_DATA
Path to file with inference data.
--batch_size BATCH_SIZE
Inference request batch size
--fp16 Use 16bit for numerical input
```
To run inference on the model exported in the previous steps, using data located under
`/data/test_data.bin`, execute:
`python triton/client.py --triton-server-url localhost:8000 --protocol HTTP --triton-model-name dlrm-onnx-16 --num_numerical_features 13 --dataset_config /data/model_size.json --inference_data /data/test_data.bin --batch_size 4096 --fp16`
or
`python triton/client.py --triton-server-url localhost:8001 --protocol GRPC --triton-model-name dlrm-onnx-16 --num_numerical_features 13 --dataset_config /data/model_size.json --inference_data /data/test_data.bin --batch_size 4096 --fp16`
### Gathering performance data
Performance data can be gathered using the `perf_client` tool. To use this tool, performance data needs
to be dumped during deployment. To do that, use the `--dump_perf_data` option of the deployer:
`python triton/deployer.py --onnx --triton-model-name dlrm-onnx-16 --triton-max-batch-size 4096 --save-dir /repository -- --model_checkpoint /results/checkpoint --fp16 --batch_size 4096 --num_numerical_features 13 --embedding_dim 128 --top_mlp_sizes 1024 1024 512 256 1 --bottom_mlp_sizes 512 256 128 --interaction_op dot --hash_indices --dataset /data --dump_perf_data /location/for/perfdata`
Once the perf data is dumped, `perf_client` can be used with the following command:
`/workspace/install/bin/perf_client --max-threads 10 -m dlrm-onnx-16 -x 1 -p 5000 -v -i gRPC -u localhost:8001 -b 4096 -l 5000 --concurrency-range 1 --input-data /location/for/perfdata -f result.csv`
For more information about `perf_client` please refer to [official documentation](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/optimization.html#perf-client).
## Throughput/Latency results
Throughput is measured in recommendations/second, and latency in milliseconds.
**ONNX FP16 inference (V100-32G)**
| **Batch Size** | **Throughput** | **Avg Latency** | **95% Latency** | **99% Latency** |
|----------------|----------------|-----------------|-----------------|-----------------|
| 1 | 432.4 rec/s | 2.31 ms | 2.42 ms | 2.51 ms |
| 8 | 3214.4 rec/s | 2.48 ms | 2.64 ms | 2.72 ms |
| 64 | 26924.8 rec/s | 2.37 ms | 2.50 ms | 2.57 ms |
| 512 | 190413 rec/s | 2.68 ms | 2.85 ms | 2.94 ms |
| 4096 | 891290 rec/s | 4.58 ms | 4.82 ms | 4.96 ms |
| 32768 | 1218970 rec/s | 26.85 ms | 27.43 ms | 28.81 ms |
| 65536 | 1245180 rec/s | 52.55 ms | 53.46 ms | 55.83 ms |
| 131072 | 1140330 rec/s | 115.24 ms | 117.56 ms | 120.32 ms |
**TorchScript FP16 inference (V100-32G)**
| **Batch Size** | **Throughput** | **Avg Latency** | **95% Latency** | **99% Latency** |
|----------------|----------------|-----------------|-----------------|-----------------|
| 1 | 399.6 rec/s | 2.50 ms | 2.56 ms | 2.70 ms |
| 8 | 3563.2 rec/s | 2.24 ms | 2.29 ms | 2.42 ms |
| 64 | 28288.2 rec/s | 2.26 ms | 2.33 ms | 2.41 ms |
| 512 | 220774 rec/s | 2.31 ms | 2.38 ms | 2.44 ms |
| 4096 | 1104280 rec/s | 3.70 ms | 3.78 ms | 3.86 ms |
| 32768 | 1428680 rec/s | 22.97 ms | 23.29 ms | 24.05 ms |
| 65536 | 1402470 rec/s | 46.80 ms | 48.12 ms | 52.88 ms |
| 131072 | 1546650 rec/s | 85.27 ms | 86.17 ms | 87.05 ms |
**TorchScript FP32 inference (V100-32G)**
| **Batch Size** | **Throughput** | **Avg Latency** | **95% Latency** | **99% Latency** |
|----------------|----------------|-----------------|-----------------|-----------------|
| 1 | 333.7 rec/s | 2.99 ms | 3.17 ms | 3.32 ms |
| 8 | 3092.8 rec/s | 2.58 ms | 2.79 ms | 2.91 ms |
| 64 | 24435.2 rec/s | 2.61 ms | 2.78 ms | 2.89 ms |
| 512 | 169216 rec/s | 3.02 ms | 3.14 ms | 3.19 ms |
| 4096 | 718438 rec/s | 5.69 ms | 5.93 ms | 6.08 ms |
| 32768 | 842138 rec/s | 38.96 ms | 39.68 ms | 41.02 ms |
| 65536 | 892138 rec/s | 73.53 ms | 74.56 ms | 74.99 ms |
| 131072 | 904397 rec/s | 146.11 ms | 149.88 ms | 151.43 ms |
**ONNX FP32 inference CPU (2x E5-2698 v4 @ 2.20GHz)**
| **Batch Size** | **Throughput** | **Avg Latency** | **95% Latency** | **99% Latency** |
|----------------|----------------|-----------------|-----------------|-----------------|
| 1 | 402.5 rec/s | 2.48 ms | 2.34 ms | 3.16 ms |
| 8 | 2316 rec/s | 3.39 ms | 2.89 ms | 6.93 ms |
| 64 | 9248 rec/s | 6.91 ms | 6.73 ms | 13.14 ms |
| 512 | 14643.3 rec/s | 35.00 ms | 42.77 ms | 69.24 ms |
| 4096 | 13926.4 rec/s | 291.28 ms | 321.90 ms | 490.06 ms |
| 32768 | 13107.2 rec/s | 2387.24 ms | 2395.80 ms | 2395.80 ms |
| 65536 | 14417.9 rec/s | 5008.26 ms | 5311.47 ms | 5311.47 ms |
| 131072 | 13107.2 rec/s | 10033.19 ms | 10416.43 ms | 10416.43 ms |
**TorchScript FP32 inference CPU (2x E5-2698 v4 @ 2.20GHz)**
| **Batch Size** | **Throughput** | **Avg Latency** | **95% Latency** | **99% Latency** |
|----------------|----------------|-----------------|-----------------|-----------------|
| 1 | 116.3 rec/s | 8.60 ms | 9.83 ms | 14.60 ms |
| 8 | 3723.2 rec/s | 2.14 ms | 2.55 ms | 2.78 ms |
| 64 | 3014.4 rec/s | 21.22 ms | 31.34 ms | 41.28 ms |
| 512 | 6451.2 rec/s | 79.69 ms | 106.00 ms | 296.39 ms |
| 4096 | 41984 rec/s | 97.71 ms | 118.70 ms | 123.37 ms |
| 32768 | 79735.5 rec/s | 407.98 ms | 426.64 ms | 430.66 ms |
| 65536 | 79021.8 rec/s | 852.90 ms | 902.39 ms | 911.46 ms |
| 131072 | 81264.6 rec/s | 1601.28 ms | 1694.64 ms | 1711.57 ms |
![Latency vs Throughput](./img/lat_vs_thr.png)
The plot above shows that the GPU is saturated at batch size 4096. However, running inference with larger batches
might be faster than running several separate inference requests. Therefore, we choose 65536 as the optimal batch size.
## Dynamic batching support
The Triton server has a built-in dynamic batching mechanism that can be enabled. When it is enabled, the server creates
inference batches from the received requests. Since the output of the model is a single probability, the batch size of a
single request may be large; here it is assumed to be 4096. With dynamic batching enabled, the server will concatenate requests of this size into
an inference batch. The upper bound on the size of the inference batch is set to 65536. All these parameters are configurable.
Performance results on a single V100-32G (ONNX FP16 model) for various numbers of simultaneous requests are shown in the figure below.
![Dynamic batching](./img/dyn_batch_concurrency.png)
The plot above shows that with a 20 ms upper bound on latency, a single GPU can handle up to 8 concurrent requests.
This leads to a total throughput of 1,776,030 recommendations/sec, which means 35,520 recommendations within 20 ms on a single GPU.

View file

@ -0,0 +1,133 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import torch
from dlrm.data import data_loader
from dlrm.data.synthetic_dataset import SyntheticDataset
from tqdm import tqdm
from tensorrtserver.api import *
from sklearn.metrics import roc_auc_score
from functools import partial
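# Build a DataLoader over the binary Criteo test file when provided,
# otherwise over an in-memory synthetic dataset with the same feature layout.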
def get_data_loader(batch_size, *, data_file, model_config):
with open(model_config.dataset_config) as f:
categorical_sizes = list(json.load(f).values())
if data_file:
data = data_loader.CriteoBinDataset(data_file=data_file,
batch_size=batch_size, subset=None,
numerical_features=model_config.num_numerical_features,
categorical_features=len(categorical_sizes),
online_shuffle=False)
else:
data = SyntheticDataset(num_entries=batch_size * 1024, batch_size=batch_size,
dense_features=model_config.num_numerical_features,
categorical_feature_sizes=categorical_sizes,
device="cpu")
return torch.utils.data.DataLoader(data,
batch_size=None,
num_workers=0,
pin_memory=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--triton-server-url", type=str, required=True,
help="URL adress of trtion server (with port)")
parser.add_argument("--triton-model-name", type=str, required=True,
help="Triton deployed model name")
parser.add_argument("--triton-model-version", type=int, default=-1,
help="Triton model version")
parser.add_argument("--protocol", type=str, default="HTTP",
help="Communication protocol (HTTP/GRPC)")
parser.add_argument("-v", "--verbose", action="store_true", default=False,
help="Verbose mode.")
parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER",
required=False, action='append',
help='HTTP headers to add to inference server requests. ' +
'Format is -H"Header:Value".')
parser.add_argument("--num_numerical_features", type=int, default=13)
parser.add_argument("--dataset_config", type=str, required=True)
parser.add_argument("--inference_data", type=str,
help="Path to file with inference data.")
parser.add_argument("--batch_size", type=int, default=1,
help="Inference request batch size")
parser.add_argument("--fp16", action="store_true", default=False,
help="Use 16bit for numerical input")
FLAGS = parser.parse_args()
FLAGS.protocol = ProtocolType.from_str(FLAGS.protocol)
# Create a health context, get the ready and live state of server.
health_ctx = ServerHealthContext(FLAGS.triton_server_url, FLAGS.protocol,
http_headers=FLAGS.http_headers, verbose=FLAGS.verbose)
print("Health for model {}".format(FLAGS.triton_model_name))
print("Live: {}".format(health_ctx.is_live()))
print("Ready: {}".format(health_ctx.is_ready()))
with ModelControlContext(FLAGS.triton_server_url, FLAGS.protocol) as ctx:
ctx.load(FLAGS.triton_model_name)
# Create a status context and get server status
status_ctx = ServerStatusContext(FLAGS.triton_server_url, FLAGS.protocol, FLAGS.triton_model_name,
http_headers=FLAGS.http_headers, verbose=FLAGS.verbose)
print("Status for model {}".format(FLAGS.triton_model_name))
print(status_ctx.get_server_status())
# Create the inference context for the model.
infer_ctx = InferContext(FLAGS.triton_server_url, FLAGS.protocol, FLAGS.triton_model_name,
FLAGS.triton_model_version,
http_headers=FLAGS.http_headers, verbose=FLAGS.verbose)
dataloader = get_data_loader(FLAGS.batch_size,
data_file=FLAGS.inference_data,
model_config=FLAGS)
results = []
tgt_list = []
for num, cat, target in tqdm(dataloader):
num = num.cpu().numpy()
if FLAGS.fp16:
num = num.astype(np.float16)
cat = cat.long().cpu().numpy()
input_dict = {"input__0": tuple(num[i] for i in range(len(num))),
"input__1": tuple(cat[i] for i in range(len(cat)))}
output_keys = ["output__0"]
output_dict = {x: InferContext.ResultFormat.RAW for x in output_keys}
result = infer_ctx.run(input_dict, output_dict, len(num))
results.append(result["output__0"])
tgt_list.append(target.cpu().numpy())
results = np.concatenate(results).squeeze()
tgt_list = np.concatenate(tgt_list)
score = roc_auc_score(tgt_list, results)
print(F"Model score: {score}")
with ModelControlContext(FLAGS.triton_server_url, FLAGS.protocol) as ctx:
ctx.unload(FLAGS.triton_model_name)

View file

@ -0,0 +1,127 @@
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import argparse
import deployer_lib
import json
#
import sys
sys.path.append('../')
from dlrm.model import Dlrm
from dlrm.data.synthetic_dataset import SyntheticDataset
def get_model_args(model_args):
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=1, type=int)
parser.add_argument("--fp16", action="store_true", default=False)
parser.add_argument("--dump_perf_data", type=str, default=None)
parser.add_argument("--model_checkpoint", type=str, default=None)
parser.add_argument("--num_numerical_features", type=int, default=13)
parser.add_argument("--embedding_dim", type=int, default=128)
parser.add_argument("--top_mlp_sizes", type=int, nargs="+",
default=[1024, 1024, 512, 256, 1])
parser.add_argument("--bottom_mlp_sizes", type=int, nargs="+",
default=[512, 256, 128])
parser.add_argument("--interaction_op", type=str, default="dot",
choices=["dot", "cat"])
parser.add_argument("--self_interaction", default=False,
action="store_true")
parser.add_argument("--hash_indices", default=False,
action="store_true")
parser.add_argument("--cpu", default=False, action="store_true")
parser.add_argument("--dataset", type=str, required=True)
return parser.parse_args(model_args)
def initialize_model(args, categorical_sizes):
''' return model, ready to trace '''
base_device = "cuda:0" if not args.cpu else "cpu"
model_config = {
"top_mlp_sizes": args.top_mlp_sizes,
"bottom_mlp_sizes": args.bottom_mlp_sizes,
"embedding_dim": args.embedding_dim,
"interaction_op": args.interaction_op,
"self_interaction": args.self_interaction,
"categorical_feature_sizes": categorical_sizes,
"num_numerical_features": args.num_numerical_features,
"hash_indices": args.hash_indices,
"base_device": base_device
}
model = Dlrm.from_dict(model_config, sigmoid=True)
model.to(base_device)
if args.model_checkpoint:
model.load_state_dict(torch.load(args.model_checkpoint,
map_location="cpu"))
if args.fp16:
model = model.half()
return model
def get_dataloader(args, categorical_sizes):
dataset_test = SyntheticDataset(num_entries=2000,
batch_size=args.batch_size,
dense_features=args.num_numerical_features,
categorical_feature_sizes=categorical_sizes,
device="cpu" if args.cpu else "cuda:0")
class RemoveOutput:
def __init__(self, dataset):
self.dataset = dataset
def __getitem__(self, idx):
value = self.dataset[idx]
if args.fp16:
value = (value[0].half(), value[1].long(), value[2])
else:
value = (value[0], value[1].long(), value[2])
return value[:-1]
def __len__(self):
return len(self.dataset)
test_loader = torch.utils.data.DataLoader(RemoveOutput(dataset_test),
batch_size=None,
num_workers=0,
pin_memory=False)
return test_loader
if __name__=='__main__':
deployer, model_args = deployer_lib.create_deployer(sys.argv[1:],
get_model_args) # returns the deployer object and the remaining model-specific arguments
with open(os.path.join(model_args.dataset, "model_size.json")) as f:
categorical_sizes = list(json.load(f).values())
model = initialize_model(model_args, categorical_sizes)
dataloader = get_dataloader(model_args, categorical_sizes)
if model_args.dump_perf_data:
input_0, input_1 = next(iter(dataloader))
if model_args.fp16:
input_0 = input_0.half()
os.makedirs(model_args.dump_perf_data, exist_ok=True)
input_0.detach().cpu().numpy()[0].tofile(os.path.join(model_args.dump_perf_data, "input__0"))
input_1.detach().cpu().numpy()[0].tofile(os.path.join(model_args.dump_perf_data, "input__1"))
deployer.deploy(dataloader, model)

View file

@ -0,0 +1,540 @@
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import shutil
import time
import json
import onnx
import torch
import argparse
import statistics
import onnxruntime
from collections import Counter
torch_type_to_triton_type = {
torch.bool: 'TYPE_BOOL',
torch.int8: 'TYPE_INT8',
torch.int16: 'TYPE_INT16',
torch.int32: 'TYPE_INT32',
torch.int64: 'TYPE_INT64',
torch.uint8: 'TYPE_UINT8',
torch.float16: 'TYPE_FP16',
torch.float32: 'TYPE_FP32',
torch.float64: 'TYPE_FP64'
}
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
{spec_inputs}
]
output [
{spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
{{
count: {engine_count}
kind: KIND_GPU
gpus: [ {gpu_list} ]
}}
]
"""
INPUT_TEMPLATE = r"""
{{
name: "input__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
OUTPUT_TEMPLATE = r"""
{{
name: "output__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
execution_accelerators {{
gpu_execution_accelerator: [
{{
name: "tensorrt"
}}
]
}}
}}
"""
def remove_empty_lines(text):
''' removes empty lines from text, returns the result '''
ret = "".join([s for s in text.strip().splitlines(True) if s.strip()])
return ret
def create_deployer(argv, model_args_parser):
''' takes a list of arguments, returns a deployer object and the list of unused arguments '''
parser = argparse.ArgumentParser()
# required args
method = parser.add_mutually_exclusive_group(required=True)
method.add_argument('--ts-script',
action='store_true',
help='convert to torchscript using torch.jit.script')
method.add_argument('--ts-trace',
action='store_true',
help='convert to torchscript using torch.jit.trace')
method.add_argument('--onnx',
action='store_true',
help='convert to onnx using torch.onnx.export')
# triton related args
arguments = parser.add_argument_group('triton related flags')
arguments.add_argument('--triton-no-cuda',
action='store_true',
help='Use the CPU for tracing.')
arguments.add_argument(
'--triton-model-name',
type=str,
default="model",
help="exports to appropriate directory structure for triton")
arguments.add_argument(
"--triton-model-version",
type=int,
default=1,
help="exports to appropriate directory structure for triton")
arguments.add_argument(
"--triton-max-batch-size",
type=int,
default=8,
help="Specifies the 'max_batch_size' in the triton model config.\
See the triton documentation for more info.")
arguments.add_argument(
"--triton-dyn-batching-delay",
type=float,
default=0,
help=
"Determines the dynamic_batching queue delay in milliseconds(ms) for\
the triton model config. Use '0' or '-1' to specify static batching.\
See the triton documentation for more info.")
arguments.add_argument(
"--triton-engine-count",
type=int,
default=1,
help=
"Specifies the 'instance_group' count value in the triton model config.\
See the triton documentation for more info.")
arguments.add_argument('--save-dir',
type=str,
default='./triton_models',
help='Saved model directory')
# other args
arguments = parser.add_argument_group('other flags')
# remainder args
arguments.add_argument(
'model_arguments',
nargs=argparse.REMAINDER,
help=
'arguments that will be ignored by deployer lib and will be forwarded to your deployer script'
)
#
args = parser.parse_args(argv)
model_args = model_args_parser(args.model_arguments[1:])
model_args_no_def = {
k: v
for k, v in vars(model_args).items()
if k in [arg[2:] for arg in args.model_arguments[1:]]
}
deployer = Deployer(args, model_args_no_def)
#
return deployer, model_args
class DeployerLibrary:
def __init__(self, args, model_args):
self.args = args
self.model_args = model_args
self.platform = None
def set_platform(self, platform):
''' sets the platform
:: platform :: "pytorch_libtorch" or "onnxruntime_onnx"
'''
self.platform = platform
def prepare_inputs(self, dataloader, device):
''' load sample inputs to device '''
inputs = []
for batch in dataloader:
if type(batch) is torch.Tensor:
batch_d = batch.to(device)
batch_d = (batch_d, )
inputs.append(batch_d)
else:
batch_d = []
for x in batch:
assert type(x) is torch.Tensor, "input is not a tensor"
batch_d.append(x.to(device) if device else x)
batch_d = tuple(batch_d)
inputs.append(batch_d)
return inputs
def get_list_of_shapes(self, l, fun):
''' returns the list of min/max shapes, depending on fun
:: l :: list of tuples of tensors
:: fun :: min or max
'''
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
assert len(tensor_tuple) == len(
shapes), "tensors with varying shape lengths are not supported"
for i, x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
shapes[i][j] = fun(shapes[i][j], x.shape[j])
return shapes # a list of shapes
def get_tuple_of_min_shapes(self, l):
''' returns the tuple of min shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, min)
min_batch = 1
shapes = [[min_batch, *shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of min shapes
def get_tuple_of_max_shapes(self, l):
''' returns the tuple of max shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, max)
max_batch = max(2, shapes[0][0])
shapes = [[max_batch, *shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of max shapes
def get_tuple_of_opt_shapes(self, l):
''' returns the tuple of opt shapes
:: l :: list of tuples of tensors '''
counter = Counter()
for tensor_tuple in l:
shapes = [x.shape for x in tensor_tuple]
shapes = tuple(shapes)
counter[shapes] += 1
shapes = counter.most_common(1)[0][0]
return shapes # tuple of the most commonly occurring shapes
def get_tuple_of_dynamic_shapes(self, l):
''' returns a tuple of dynamic shapes: variable tensor dimensions
(for ex. batch size) occur as -1 in the tuple
:: l :: list of tuples of tensors '''
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
err_msg = "tensors with varying shape lengths are not supported"
assert len(tensor_tuple) == len(shapes), err_msg
for i, x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
if shapes[i][j] != x.shape[j] or j == 0:
shapes[i][j] = -1
shapes = tuple(shapes)
return shapes # tuple of dynamic shapes
def run_models(self, models, inputs):
''' run the models on inputs, return the outputs and execution times '''
ret = []
for model in models:
torch.cuda.synchronize()
time_start = time.time()
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
torch.cuda.synchronize()
time_end = time.time()
t = time_end - time_start
ret.append(outputs)
ret.append(t)
return ret
def compute_errors(self, outputs_A, outputs_B):
''' returns the list of L_inf errors computed over every single output tensor '''
Linf_errors = []
for output_A, output_B in zip(outputs_A, outputs_B):
for x, y in zip(output_A, output_B):
error = (x - y).norm(float('inf')).item()
Linf_errors.append(error)
return Linf_errors
def print_errors(self, Linf_errors):
''' print various statistics of Linf errors '''
print()
print("conversion correctness test results")
print("-----------------------------------")
print("maximal absolute error over dataset (L_inf): ",
max(Linf_errors))
print()
print("average L_inf error over output tensors: ",
statistics.mean(Linf_errors))
print("variance of L_inf error over output tensors: ",
statistics.variance(Linf_errors))
print("stddev of L_inf error over output tensors: ",
statistics.stdev(Linf_errors))
print()
def write_config(self,
config_filename,
input_shapes,
input_types,
output_shapes,
output_types):
''' writes triton config file
:: config_filename :: the file to write the config file into
:: input_shapes :: tuple of dynamic shapes of the input tensors
:: input_types :: tuple of torch types of the input tensors
:: output_shapes :: tuple of dynamic shapes of the output tensors
:: output_types :: tuple of torch types of the output tensors
'''
assert self.platform is not None, "error - platform is not set"
config_template = CONFIG_TEMPLATE
accelerator_template = MODEL_OPTIMIZATION_TEMPLATE
input_template = INPUT_TEMPLATE
spec_inputs = r""""""
for i,(shape,typ) in enumerate(zip(input_shapes,input_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_inputs += input_template.format_map(d)
spec_inputs = spec_inputs[:-1]
output_template = OUTPUT_TEMPLATE
spec_outputs = r""""""
for i,(shape,typ) in enumerate(zip(output_shapes,output_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_outputs += output_template.format_map(d)
spec_outputs = spec_outputs[:-1]
batching_str = ""
parameters_str = ""
max_batch_size = self.args.triton_max_batch_size
accelerator_str = ""
if (self.args.triton_dyn_batching_delay > 0):
# Use only full and half full batches
pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]
batching_str = r"""
dynamic_batching {{
preferred_batch_size: [{0}]
max_queue_delay_microseconds: {1}
}}""".format(", ".join([str(x) for x in pref_batch_size]),
int(self.args.triton_dyn_batching_delay * 1000.0))
if self.platform == 'onnxruntime_onnx':
accelerator_str = accelerator_template.format_map({})
config_values = {
"model_name":
self.args.triton_model_name,
"platform":
self.platform,
"max_batch_size":
max_batch_size,
"spec_inputs":
spec_inputs,
"spec_outputs":
spec_outputs,
"dynamic_batching":
batching_str,
"model_parameters":
parameters_str,
"model_optimizations":
accelerator_str,
"gpu_list":
", ".join([str(x) for x in range(torch.cuda.device_count())]),
"engine_count":
self.args.triton_engine_count
}
# write config
with open(config_filename, "w") as file:
final_config_str = config_template.format_map(config_values)
final_config_str = remove_empty_lines(final_config_str)
file.write(final_config_str)
class Deployer:
def __init__(self, args, model_args):
self.args = args
self.lib = DeployerLibrary(args, model_args)
def deploy(self, dataloader, model):
''' deploy the model and test for correctness with dataloader '''
if self.args.ts_script or self.args.ts_trace:
self.lib.set_platform("pytorch_libtorch")
print("deploying model " + self.args.triton_model_name +
" in format " + self.lib.platform)
self.to_triton_torchscript(dataloader, model)
elif self.args.onnx:
self.lib.set_platform("onnxruntime_onnx")
print("deploying model " + self.args.triton_model_name +
" in format " + self.lib.platform)
self.to_triton_onnx(dataloader, model)
else:
assert False, "error"
print("done")
def to_triton_onnx(self, dataloader, model):
''' export the model to onnx and test correctness on dataloader '''
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device=None)
# generate outputs
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# generate output types
output_types = [x.dtype for x in outputs[0]]
# get input names
rng = range(len(input_types))
input_names = ["input__" + str(num) for num in rng]
# get output names
rng = range(len(output_types))
output_names = ["output__" + str(num) for num in rng]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.onnx')
if not os.path.exists(final_model_path):
os.makedirs(final_model_path)
final_model_path = os.path.join(final_model_path, 'model.onnx')
# get indices of dynamic input and output shapes
dynamic_axes = {}
for input_name,input_shape in zip(input_names,input_shapes):
dynamic_axes[input_name] = [i for i,x in enumerate(input_shape) if x == -1]
for output_name,output_shape in zip(output_names,output_shapes):
dynamic_axes[output_name] = [i for i,x in enumerate(output_shape) if x == -1]
# export the model
assert not model.training, "internal error - model should be in eval() mode! "
with torch.no_grad():
torch.onnx.export(model, inputs[0], final_model_path, verbose=False,
input_names=input_names, output_names=output_names,
dynamic_axes=dynamic_axes, opset_version=11,
use_external_data_format=True)
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)
def to_triton_torchscript(self, dataloader, model):
''' export the model to torchscript and test correctness on dataloader '''
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device=None)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.pt')
# convert the model
with torch.no_grad():
if self.args.ts_trace: # trace it
model_ts = torch.jit.trace(model, inputs[0])
if self.args.ts_script: # script it
model_ts = torch.jit.script(model)
# generate outputs
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
# save the model
torch.jit.save(model_ts, final_model_path)
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate output types
output_types = [x.dtype for x in outputs[0]]
# now we build the config for triton
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)

Binary file not shown (image added, 27 KiB).

Binary file not shown (image added, 44 KiB).

View file

@ -66,6 +66,9 @@ done
if [ "$PRECISION" = "amp" ]
then
AMP_RUN="--amp-run"
elif [ "$PRECISION" = "fp16" ]
then
AMP_RUN="--fp16"
fi
LOG_SUFFIX=bs${BATCH_SIZE}_il${INPUT_LENGTH}_${PRECISION}
@ -76,14 +79,14 @@ LOGFILE=log_${LOG_SUFFIX}.log
if [ "$TEST_PROGRAM" = "trt/test_infer_trt.py" ]
then
MODELS="--encoder $ENCODER_CKPT --decoder $DECODER_CKPT --postnet $POSTNET_CKPT"
TACOTRON2_PARAMS="--encoder $ENCODER_CKPT --decoder $DECODER_CKPT --postnet $POSTNET_CKPT"
else
MODELS="--tacotron2 $TACOTRON2_CKPT"
TACOTRON2_PARAMS="--tacotron2 $TACOTRON2_CKPT"
fi
set -x
python $TEST_PROGRAM \
$MODELS \
$TACOTRON2_PARAMS \
--waveglow $WAVEGLOW_CKPT \
--batch-size $BATCH_SIZE \
--input-length $INPUT_LENGTH $AMP_RUN \

View file

@ -218,22 +218,23 @@ def infer_tacotron2_trt(encoder, decoder_iter, postnet,
decoder_outputs = init_decoder_outputs(memory, sequence_lengths)
print("Running Tacotron2 Decoder")
measurements_decoder = {}
while True:
decoder_tensors = init_decoder_tensors(decoder_inputs, decoder_outputs)
with MeasureTime(measurements, "step"):
with MeasureTime(measurements_decoder, "step"):
run_trt_engine(decoder_context, decoder_iter, decoder_tensors)
if first_iter:
mel_outputs = torch.unsqueeze(decoder_outputs[7], 2)
gate_outputs = torch.unsqueeze(decoder_outputs[8], 2)
alignments = torch.unsqueeze(decoder_outputs[4], 2)
measurements['tacotron2_decoder_time'] = measurements['step']
measurements['tacotron2_decoder_time'] = measurements_decoder['step']
first_iter = False
else:
mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(decoder_outputs[7], 2)), 2)
gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(decoder_outputs[8], 2)), 2)
alignments = torch.cat((alignments, torch.unsqueeze(decoder_outputs[4], 2)), 2)
measurements['tacotron2_decoder_time'] += measurements['step']
measurements['tacotron2_decoder_time'] += measurements_decoder['step']
dec = torch.le(torch.sigmoid(decoder_outputs[8]), gate_threshold).to(torch.int32).squeeze(1)
not_finished = not_finished*dec
@ -271,10 +272,8 @@ def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
mel_size = mel.size(2)
batch_size = mel.size(0)
stride = 256
kernel_size = 1024
n_group = 8
z_size = (mel_size-1)*stride+(kernel_size-1)+1
z_size = z_size - (kernel_size-stride)
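# note: the removed expression (mel_size-1)*stride + (kernel_size-1) + 1 - (kernel_size-stride) simplifies to mel_size*stride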
z_size = mel_size*stride
z_size = z_size//n_group
z = torch.randn(batch_size, n_group, z_size, 1).cuda()
audios = torch.zeros(batch_size, mel_size*stride).cuda()

View file

@ -1,6 +1 @@
#!/bin/bash
for i in {1..1003}
do
python trt/inference_trt.py -i ./phrases/phrase_1_128.txt --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_iter_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine -o output/ --fp16 >> tmp_log_bs1_fp16.log 2>&1
done
bash test_infer.sh --test trt/test_infer_trt.py -bs 1 -il 128 -p fp16 --num-iters 1003 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_iter_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine

View file

@ -0,0 +1,265 @@
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import sys
sys.path.append('./')
from tacotron2.text import text_to_sequence
import models
import torch
import argparse
import numpy as np
from scipy.io.wavfile import write
from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence
from inference_trt import infer_tacotron2_trt, infer_waveglow_trt
from trt.trt_utils import load_engine, run_trt_engine
import tensorrt as trt
import time
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
from apex import amp
def parse_args(parser):
"""
Parse commandline arguments.
"""
parser.add_argument('--encoder', type=str, required=True,
help='full path to the Encoder engine')
parser.add_argument('--decoder', type=str, required=True,
help='full path to the DecoderIter engine')
parser.add_argument('--postnet', type=str, required=True,
help='full path to the Postnet engine')
parser.add_argument('--waveglow', type=str, required=True,
help='full path to the WaveGlow engine')
parser.add_argument('--waveglow-ckpt', type=str, default="",
help='full path to the WaveGlow model checkpoint file')
parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
help='Sampling rate')
parser.add_argument('--fp16', action='store_true',
help='inference with FP16')
parser.add_argument('--log-file', type=str, default='nvlog.json',
help='Filename for logging')
parser.add_argument('--stft-hop-length', type=int, default=256,
help='STFT hop length for estimating audio length from mel size')
parser.add_argument('--num-iters', type=int, default=10,
help='Number of iterations')
parser.add_argument('-il', '--input-length', type=int, default=64,
help='Input length')
parser.add_argument('-bs', '--batch-size', type=int, default=1,
help='Batch size')
return parser
def load_and_setup_model(model_name, parser, checkpoint, amp_run, to_cuda=True):
model_parser = models.parse_model_args(model_name, parser, add_help=False)
model_args, _ = model_parser.parse_known_args()
model_config = models.get_model_config(model_name, model_args)
model = models.get_model(model_name, model_config, to_cuda=to_cuda)
if checkpoint is not None:
if to_cuda:
state_dict = torch.load(checkpoint)['state_dict']
else:
state_dict = torch.load(checkpoint,map_location='cpu')['state_dict']
if checkpoint_from_distributed(state_dict):
state_dict = unwrap_distributed(state_dict)
model.load_state_dict(state_dict)
if model_name == "WaveGlow":
model = model.remove_weightnorm(model)
model.eval()
if amp_run:
model, _ = amp.initialize(model, [], opt_level="O3")
return model
def print_stats(measurements_all):
print(np.mean(measurements_all['latency'][1:]),
np.mean(measurements_all['throughput'][1:]),
np.mean(measurements_all['pre_processing'][1:]),
np.mean(measurements_all['type_conversion'][1:])+
np.mean(measurements_all['storage'][1:])+
np.mean(measurements_all['data_transfer'][1:]),
np.mean(measurements_all['num_mels_per_audio'][1:]))
throughput = measurements_all['throughput']
preprocessing = measurements_all['pre_processing']
type_conversion = measurements_all['type_conversion']
storage = measurements_all['storage']
data_transfer = measurements_all['data_transfer']
postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
latency = measurements_all['latency']
num_mels_per_audio = measurements_all['num_mels_per_audio']
latency.sort()
cf_50 = max(latency[:int(len(latency)*0.50)])
cf_90 = max(latency[:int(len(latency)*0.90)])
cf_95 = max(latency[:int(len(latency)*0.95)])
cf_99 = max(latency[:int(len(latency)*0.99)])
cf_100 = max(latency[:int(len(latency)*1.0)])
print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio)))
print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
print("Latency cl 90 (seconds) = {:.4f}".format(cf_90))
print("Latency cl 95 (seconds) = {:.4f}".format(cf_95))
print("Latency cl 99 (seconds) = {:.4f}".format(cf_99))
print("Latency cl 100 (seconds) = {:.4f}".format(cf_100))
def main():
"""
Launches text to speech (inference).
Inference is executed on a single GPU.
"""
parser = argparse.ArgumentParser(
description='PyTorch Tacotron 2 Inference')
parser = parse_args(parser)
args, unknown_args = parser.parse_known_args()
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
StdOutBackend(Verbosity.VERBOSE)])
for k,v in vars(args).items():
DLLogger.log(step="PARAMETER", data={k:v})
DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
measurements_all = {"pre_processing": [],
"tacotron2_encoder_time": [],
"tacotron2_decoder_time": [],
"tacotron2_postnet_time": [],
"tacotron2_latency": [],
"waveglow_latency": [],
"latency": [],
"type_conversion": [],
"data_transfer": [],
"storage": [],
"tacotron2_items_per_sec": [],
"waveglow_items_per_sec": [],
"num_mels_per_audio": [],
"throughput": []}
print("args:", args, unknown_args)
torch.cuda.init()
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
encoder = load_engine(args.encoder, TRT_LOGGER)
decoder_iter = load_engine(args.decoder, TRT_LOGGER)
postnet = load_engine(args.postnet, TRT_LOGGER)
waveglow = load_engine(args.waveglow, TRT_LOGGER)
if args.waveglow_ckpt != "":
# setup denoiser using WaveGlow PyTorch checkpoint
waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
True)
denoiser = Denoiser(waveglow_ckpt).cuda()
# after initialization, we don't need WaveGlow PyTorch checkpoint
# anymore - deleting
del waveglow_ckpt
torch.cuda.empty_cache()
# create TRT contexts for each engine
encoder_context = encoder.create_execution_context()
decoder_context = decoder_iter.create_execution_context()
postnet_context = postnet.create_execution_context()
waveglow_context = waveglow.create_execution_context()
texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
texts = [texts[0][:args.input_length]]
texts = texts*args.batch_size
warmup_iters = 3
for iter in range(args.num_iters):
measurements = {}
with MeasureTime(measurements, "pre_processing"):
sequences_padded, input_lengths = prepare_input_sequence(texts)
sequences_padded = sequences_padded.to(torch.int32)
input_lengths = input_lengths.to(torch.int32)
with torch.no_grad():
with MeasureTime(measurements, "latency"):
with MeasureTime(measurements, "tacotron2_latency"):
mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
encoder_context, decoder_context, postnet_context,
sequences_padded, input_lengths, measurements, args.fp16)
with MeasureTime(measurements, "waveglow_latency"):
audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)
num_mels = mel.size(0)*mel.size(2)
num_samples = audios.size(0)*audios.size(1)
with MeasureTime(measurements, "type_conversion"):
audios = audios.float()
with MeasureTime(measurements, "data_transfer"):
audios = audios.cpu()
with MeasureTime(measurements, "storage"):
audios = audios.numpy()
for i, audio in enumerate(audios):
audio_path = "audio_"+str(i)+".wav"
write(audio_path, args.sampling_rate,
audio[:mel_lengths[i]*args.stft_hop_length])
measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
measurements['num_mels_per_audio'] = mel.size(2)
measurements['throughput'] = num_samples/measurements['latency']
if iter >= warmup_iters:
for k,v in measurements.items():
if k in measurements_all.keys():
measurements_all[k].append(v)
DLLogger.log(step=(iter-warmup_iters), data={k: v})
DLLogger.flush()
print_stats(measurements_all)
if __name__ == '__main__':
main()

View file

@ -25,13 +25,14 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
- __VNet__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/VNet)]
### Natural Language Processing
- __BERT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)]
- __GNMT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/GNMT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Translation/GNMT)]
- __Transformer__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/Transformer)]
- __BERT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)]
- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)]
- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL)]
### Recommender Systems
- __DLRM__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM)]
- __NCF__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF)]
- __VAE-CF__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF)]
- __WideAndDeep__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep)]
@ -67,18 +68,20 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
| [ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [SE-ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [SSD300 v1.1](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) |PyTorch | N/A | Yes | Yes | Yes | - | - | Yes | - |
| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) |PyTorch | N/A | Yes | Yes | Yes | - | - | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT/triton) | - |
| [Transformer-XL](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL) |PyTorch | N/A | Yes | Yes | Yes | - | - | - | - |
| [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF) |PyTorch | N/A | Yes | Yes | - | - |- | - | - |
| [DLRM](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM) |PyTorch | N/A | Yes | - | - | - |- | - | - |
| [Mask R-CNN](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/MaskRCNN) |PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [Jasper](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper) |PyTorch | N/A | Yes | Yes | - | Yes | Yes | Yes | - |
| [Tacotron 2 And WaveGlow v1.10](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) | PyTorch | N/A | Yes | Yes | - | Yes | Yes | Yes | - |
| [Jasper](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper) |PyTorch | N/A | Yes | Yes | - | Yes | Yes | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper/trtis) | - |
| [Tacotron 2 And WaveGlow v1.10](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) | PyTorch | N/A | Yes | Yes | - | Yes | Yes | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis) | - |
| [GNMT v2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/GNMT) |PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [Transformer](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/Transformer) |PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [ResNet-50 v1.5](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/RN50v1.5) |TensorFlow | Yes | Yes | Yes | - | - | - | - | - |
| [SSD320 v1.2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Detection/SSD) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) |TensorFlow | N/A | Yes | Yes | Yes | Yes | - | Yes | Yes |
| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) |TensorFlow | N/A | Yes | Yes | Yes | Yes | - | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/trtis) | Yes |
| [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [Transformer-XL](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [Variational Autoencoder Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [WideAndDeep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |

View file

@ -0,0 +1,7 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.12-tf1-py3
FROM ${FROM_IMAGE_NAME}
WORKDIR /workspace/transformer-xl/tf
RUN pip install --no-cache-dir 'git+https://github.com/NVIDIA/dllogger'
ADD tf/ /workspace/transformer-xl/tf

View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -0,0 +1,9 @@
Transformer-XL for TensorFlow
This repository includes software from https://github.com/kimiyoung/transformer-xl licensed under the Apache License 2.0.
This repository includes software from https://github.com/salesforce/awd-lstm-lm licensed under the BSD-3-Clause license.
This repository includes software from https://github.com/cybertronai/transformer-xl licensed under the Apache License 2.0.
This repository includes software from https://github.com/cybertronai/pytorch-lamb licensed under the MIT license.

View file

@ -0,0 +1,945 @@
# Transformer-XL For TensorFlow
This repository provides a script and recipe to train the Transformer-XL model
to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA.
## Table Of Contents
<!-- TOC GFM -->
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
* [Base model](#base-model)
* [Training accuracy: NVIDIA DGX-2 (16x V100 32G)](#training-accuracy-nvidia-dgx-2-16x-v100-32g)
* [Base model](#base-model-1)
* [Training loss plot](#training-loss-plot)
* [Base model](#base-model-2)
* [Training stability test](#training-stability-test)
* [Base model](#base-model-3)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Base model](#base-model-4)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Base model](#base-model-5)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Base model](#base-model-6)
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Base model](#base-model-7)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
<!-- /TOC -->
## Model overview
This repository provides an implementation of the Transformer-XL model in
[TensorFlow](https://www.tensorflow.org) from the paper [Transformer-XL: Attentive
Language Models Beyond a Fixed-Length
Context](https://arxiv.org/abs/1901.02860). Transformer-XL is a
transformer-based language model with a segment-level recurrence and a novel
relative positional encoding. Enhancements introduced in Transformer-XL help
capture better long-term dependencies by attending to tokens from multiple
previous segments.
Our implementation is based on the
[codebase](https://github.com/kimiyoung/transformer-xl) published by the
authors of the Transformer-XL paper.
Our implementation uses a modified model architecture. Our
modifications were made to achieve better hardware utilization and to take
advantage of Tensor Cores. Similar modifications were also proposed in an
implementation available from
[github.com/cybertronai/transformer-xl](https://github.com/cybertronai/transformer-xl).
Refer to the [Model architecture](#model-architecture) section for more
details.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta
GPUs and evaluated on Volta and Turing GPUs. Therefore, researchers can get
results up to 1.5x faster than training without Tensor Cores, while
experiencing the benefits of mixed precision training. This model is tested
against each NGC monthly container release to ensure consistent accuracy and
performance over time.
### Model architecture
The Transformer-XL "base" model for WikiText-103 dataset available in this
repository was modified to use the following hyperparameter values:
|**Hyperparameter**|**Description**|**Original setting for the base model**|**Our modification to the base model**|
|------------------|---------------|--------------------------------------:|--------------------------------------:|
| `d_model` | hidden size | 410 | 512 |
| `n_head` | number of attention heads | 10 | 8 |
| `d_head` | size of each attention head | 41 | 64 |
| `d_inner` | hidden size in fully-connected layers | 2100 | 2048 |
| `tgt_len` | number of tokens to predict during training | 150 | 192 |
| `mem_len` | number of tokens cached from previous iterations during training | 150 | 192 |
The changes described above were made to align certain hyperparameters with powers
of two; with this modification, the model achieves better hardware utilization
and therefore higher training throughput.
The following table lists the hyperparameters for the base
Transformer-XL model for the WikiText-103 dataset available in this repository.
| **Hyperparameter** | **Description** | **Base model** |
| ------------------ | ---------------------------------------------------------------- | -------------: |
| `n_layer` | number of layers | 16 |
| `d_model` | hidden size | 512 |
| `n_head` | number of attention heads | 8 |
| `d_head` | size of each attention head | 64 |
| `d_inner` | inner hidden size in fully-connected layers | 2048 |
| `dropout` | dropout | 0.1 |
| `dropatt` | dropout after softmax in the attention | 0.0 |
| `lr` | base learning rate | 0.01 |
| `min_lr_ratio` | minimum ratio learning rate (for cosine decay) | 0.1 |
| `max_step` | number of training steps | 40,000 |
| `warmup_step` | number of learning rate warmup steps | 1,000 |
| `batch_size` | training batch size | 256 |
| `tgt_len` | number of tokens to predict during training | 192 |
| `mem_len` | number of tokens cached from previous iterations during training | 192 |
The Transformer-XL model addresses the limitations of vanilla transformer-based
language models, which are only able to use relatively short context, bounded
by the segment length. The Transformer-XL introduces a recurrence mechanism,
which is able to use a cached hidden state from previous segments. During
training, the context consists of a concatenation of the current segment's hidden
state and cached states from previous iterations. Gradients are backpropagated
only through the current segment, although the model is able to take advantage
of the extra information stored in the cache and therefore is able to model
long-term dependencies.
An illustration of the recurrence mechanism taken from the [Transformer-XL
paper](https://arxiv.org/abs/1901.02860) is shown below.
![model](tf/img/model.png)
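A minimal sketch of the recurrence mechanism, assuming `prev_mem` holds the cached
hidden states of one layer and `curr_hidden` the hidden states of the current segment,
both shaped `[length, batch, d_model]` (the function name is illustrative, not the one
used in `tf/model.py`):
```
import tensorflow as tf

def extend_with_memory(prev_mem, curr_hidden, mem_len):
    # Cached states provide extra context for attention but receive no gradients.
    cached = tf.stop_gradient(prev_mem)
    context = tf.concat([cached, curr_hidden], axis=0)   # [mem_len + tgt_len, batch, d_model]
    new_mem = tf.stop_gradient(context[-mem_len:])       # keep the most recent mem_len states
    return context, new_mem
```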
### Default configuration
The following features were implemented in this model:
* general
  * single-node, Horovod multi-GPU training
  * training and inference with mixed precision using Tensor Cores
  * automatic mixed precision training (AMP)
* model
  * 16-layer base Transformer-XL model with hidden size 512, 8 attention heads,
    each head with hidden size 64
  * the model is trained on the
    [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
    dataset, using a word-level vocabulary and adaptive softmax
  * embedding weights are tied with weights in the classifier
* training
  * training with the [LAMB](https://arxiv.org/abs/1904.00962) optimizer; the
    implementation of the optimizer uses [XLA](https://www.tensorflow.org/xla), which enables
    the fusion of elementwise operations and accelerates training
  * support for training with gradient accumulation
  * base model:
    * linear learning rate warmup for 1,000 iterations, followed by a cosine
      learning rate schedule; the initial learning rate is set to 0.0 and the final
      learning rate to 0.001 (`min_lr_ratio * base_lr`)
    * training for 40,000 steps, using a batch size of 256
* inference
  * support for single-GPU inference
  * each token uses the same amount of context from previous time steps
  * base model:
    * target length is set to 64, length of memory is set to 640
    * positional embeddings are clamped after 400 time steps (see the sketch below)
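The clamping of positional embeddings mentioned in the list above amounts to capping the
relative position indices, so all tokens further away than `clamp_len` share one positional
representation; a sketch of the idea (the actual logic lives in `tf/model.py` and is
controlled by the `--clamp_len` flag):
```
import tensorflow as tf

def clamp_positions(pos_seq, clamp_len):
    # Positions beyond clamp_len reuse the embedding of position clamp_len.
    if clamp_len > 0:
        pos_seq = tf.minimum(pos_seq, clamp_len)
    return pos_seq
```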
### Feature support matrix
The following features are supported by this model:
| **Feature** | **Transformer-XL** |
|:------------|-------------------:|
|[Automatic mixed precision (AMP)](https://nvidia.github.io/apex/amp.html) | Yes |
|[Horovod Multi-GPU (NCCL)](https://github.com/horovod/horovod) | Yes |
|[LAMB](https://arxiv.org/abs/1904.00962v3) | Yes |
#### Features
[TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) - a
tool that enables Tensor Core-accelerated training. Refer to the [Enabling
mixed precision](#enabling-mixed-precision) section for more details.
[Horovod](https://github.com/horovod/horovod) - Horovod
is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet.
The goal of Horovod is to make distributed deep learning fast and easy to use.
For more information about how to get started with Horovod, see the [Horovod:
Official repository](https://github.com/horovod/horovod).
[Multi-GPU training with Horovod](https://github.com/horovod/horovod/#usage) - our model
uses Horovod to implement efficient multi-GPU training with NCCL. For details,
see example sources in this repository or see the [TensorFlow
tutorial](https://github.com/horovod/horovod/#usage).
[LAMB](https://arxiv.org/abs/1904.00962v3) - stands
for Layerwise Adaptive Moments Based optimizer; it is a large batch optimization
technique that helps accelerate the training of deep neural networks using large
minibatches.
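A rough NumPy sketch of the per-variable LAMB update implemented in `tf/lamb.py`,
assuming the Adam moments `m` and `v` have already been updated for the current step:
```
import numpy as np

def lamb_step(var, m, v, lr, wd, eps=1e-6):
    # Adam-style direction plus decoupled weight decay.
    step = m / (np.sqrt(v) + eps) + wd * var
    r1 = np.sqrt(np.sum(var ** 2))    # weight norm
    r2 = np.sqrt(np.sum(step ** 2))   # update norm
    # Layer-wise trust ratio, clamped as in tf/lamb.py.
    ratio = min(r1, 10.0) / r2 if (r1 > 0 and r2 > 0) else 1.0
    return var - lr * ratio * step
```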
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a
computational method.
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
computational speedup by performing operations in half-precision format while
storing minimal information in single-precision to retain as much information
as possible in critical parts of the network. Since the introduction of [Tensor
Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing
architectures, significant training speedups are experienced by switching to
mixed precision -- up to 3x overall speedup on the most arithmetically intense
model architectures. Using mixed precision training previously required two
steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
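For illustration, step 2 above (manual loss scaling) roughly amounts to scaling the loss
before computing gradients and unscaling the gradients before the optimizer step; a
minimal TF1-style sketch with a hypothetical fixed scale:
```
import tensorflow as tf

def scaled_gradients(loss, variables, loss_scale=128.0):
    # Scale the loss so that small FP16 gradients do not underflow,
    # then unscale the gradients before they reach the optimizer.
    grads = tf.gradients(loss * loss_scale, variables)
    return [g / loss_scale if g is not None else None for g in grads]
```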
The ability to train deep learning networks with lower precision was introduced
in the Pascal architecture and first supported in [CUDA
8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
Learning SDK.
For information about:
* How to train using mixed precision, see the [Mixed Precision
Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision
Training of Deep Neural
Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
blog.
* How to access and enable AMP for TensorFlow, see [Using
TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp)
from the TensorFlow User Guide.
#### Enabling mixed precision
Automatic Mixed Precision (AMP) for TensorFlow enables the full [mixed precision
methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing
TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow
framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and to maximize the use of FP16, and the
loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing
`tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the
automatic mixed precision optimization. It accomplishes this by automatically rewriting all computation graphs with the
necessary operations to enable mixed precision training and automatic loss scaling.
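In this repository, AMP is toggled through the `--fp16` flag of the launch scripts. As a
general illustration (an assumption about the surrounding tooling, not a description of
`tf/main.py`), TF-AMP in the TF1 NGC containers can be enabled either through an
environment variable or by wrapping an existing optimizer with the graph-rewrite helper
available in TensorFlow 1.14+:
```
import os
import tensorflow as tf

# Option 1: environment variable honored by the NGC TF1 containers.
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

# Option 2: explicit graph rewrite around an existing optimizer (TF >= 1.14).
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
```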
## Setup
The following section lists the requirements that you need to meet in order to
start training the Transformer-XL model.
### Requirements
This repository contains `Dockerfile` which extends the TensorFlow NGC container
and encapsulates some dependencies. Aside from these dependencies, ensure you
have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow 19.12-tf1-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
or [Turing](https://www.nvidia.com/pl-pl/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep
Learning DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry),
* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container,
see the versioned [NVIDIA Container Support
Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32,
perform the following steps using the default parameters of the Transformer-XL
base model on the
[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
dataset.
For the specifics concerning training
and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/LanguageModeling/Transformer-XL
```
2. Download and preprocess the dataset.
```
bash getdata.sh
```
3. Build the Transformer-XL TensorFlow NGC container.
```
bash tf/scripts/docker/build.sh
```
4. Start an interactive session in the NGC container to run training/inference.
```
bash tf/scripts/docker/interactive.sh
```
5. Create tfrecords before your first training/evaluation run for a given batch size per GPU.
Use the same `--batch_chunk` and `--train_batch_size` flags as in training.
For training on DGX-1 with gradient accumulation in 2 steps:
```
bash run_wt103_base.sh train_data --batch_chunk 2
```
For single GPU training with gradient accumulation in 16 steps:
```
bash run_wt103_base.sh train_data --batch_chunk 16
```
For evaluation:
```
bash run_wt103_base.sh test_data
```
6. Start training.
To start mixed precision training on 8 GPUs on DGX-1, run:
```
bash run_wt103_base.sh train 8 --fp16 --batch_chunk 2
```
To start FP32 training on single GPU, run:
```
bash run_wt103_base.sh train 1 --batch_chunk 16
```
To start mixed precision training on 16 GPUs on DGX-2, run:
```
bash run_wt103_base.sh train 16 --fp16
```
To start FP32 training on 16 GPUs on DGX-2, run:
```
bash run_wt103_base.sh train 16
```
For more information on the available options, and for an explanation of what
happens at the end of training, refer to the [Training
process](#training-process) section.
7. Start evaluation.
To start mixed precision inference on the test set, run:
```
bash run_wt103_base.sh eval [--fp16]
```
The `--fp16` flag is optional; if it is set, the script
launches mixed precision inference with Tensor Cores. If the flag is not
present, the script launches FP32 inference.
By default, the script loads the checkpoint from
`LM-TFM/model.ckpt`, which corresponds to the last checkpoint saved by the
previous training run. The path to the checkpoint can be customized by
setting the `--model_dir` flag.
For more information on the available options, refer to the [Inference
process](#inference-process) section.
## Advanced
The following sections provide greater details of the dataset, running training
and inference, and the training results.
### Scripts and sample code
* `Dockerfile`: a container with the basic set of dependencies to run
Transformer-XL
In the `tf` directory, the most important files are:
* `data_utils.py`: data loading utilities
* `exp_utils.py`: utility functions for running training and benchmarking
* `lamb.py`: implementation of [LAMB](https://arxiv.org/abs/1904.00962)
optimizer
* `main.py`: serves as the entry point to launch the training and inference
* `model.py`: implementation of the Transformer-XL model
* `vocabulary.py`: implementation of word-level vocabulary
### Parameters
The complete list of available parameters for the `tf/main.py` script contains:
```
--batch_chunk: Number of accumulation steps.
(default: '1')
(an integer)
--clamp_len: Clamp length
(default: '-1')
(an integer)
--clip: Gradient clipping value.
(default: '0.25')
(a number)
--corpus_info_path: Path to corpus-info.json file.
(default: '')
--d_embed: Dimension of the embeddings.
(default: '512')
(an integer)
--d_head: Dimension of each attention head.
(default: '64')
(an integer)
--d_inner: Dimension of inner hidden size in positionwise feed-forward.
(default: '2048')
(an integer)
--d_model: Dimension of the model.
(default: '512')
(an integer)
--data_dir: Path to tf-records directory.
(default: '')
--div_val: Divide the embedding size by this val for each bin
(default: '1')
(an integer)
--[no]do_eval: Whether to run eval on the dev set.
(default: 'false')
--[no]do_train: Whether to run training.
(default: 'true')
--dropatt: Attention dropout rate.
(default: '0.0')
(a number)
--dropout: Dropout rate.
(default: '0.1')
(a number)
--eval_batch_size: Size of valid batch.
(default: '16')
(an integer)
--eval_ckpt_path: Checkpoint path for do_test evaluation.If set, model_dir will be ignored.If unset, will use the latest ckpt in model_dir.
--eval_split: Which data split to evaluate.
(default: 'valid')
--[no]fp16: Whether to enable AMP ops.
(default: 'false')
--init: <normal|uniform>: Initialization method.
(default: 'normal')
--init_range: Initialization std when init is uniform.
(default: '0.1')
(a number)
--init_std: Initialization std when init is normal.
(default: '0.02')
(a number)
--learning_rate: Maximum learning rate.
(default: '0.01')
(a number)
--log_interval: Number of iterations per repeat loop.
(default: '100')
(an integer)
--max_eval_batch: Set -1 to turn off. Only used in test mode.
(default: '-1')
(an integer)
--mem_len: Number of steps to cache
(default: '192')
(an integer)
--min_lr_ratio: Minimum ratio learning rate.
(default: '0.1')
(a number)
--model_dir: Estimator model_dir.
(default: 'LM-TFM')
--n_head: Number of attention heads.
(default: '8')
(an integer)
--n_layer: Number of layers.
(default: '16')
(an integer)
--num_core_per_host: Number of cores per host
(default: '8')
(an integer)
--percentiles: percentiles for latency confidence intervals
(default: '90,95,99')
(a comma separated list)
--proj_init_std: Initialization std for embedding projection.
(default: '0.01')
(a number)
--[no]proj_same_dim: Project the bin with the same dimension.
(default: 'true')
--[no]proj_share_all_but_first: True to share all but first projs, False not to share.
(default: 'false')
--record_info_dir: Path to local directory containing filenames.txt.
(default: '')
--[no]same_length: Same length attention
(default: 'false')
--save_steps: number of steps for model checkpointing.
(default: '5000')
(an integer)
--tgt_len: Number of steps to predict
(default: '192')
(an integer)
--[no]tie_weight: Tie embedding and softmax weight.
(default: 'true')
--train_batch_size: Size of train batch.
(default: '256')
(an integer)
--train_steps: Total number of training steps.
(default: '40000')
(an integer)
--[no]untie_r: untie r_w_bias and r_r_bias
(default: 'false')
--warmup_steps: Number of steps for linear lr warmup.
(default: '1000')
(an integer)
```
### Command-line options
To see the full list of available options and their descriptions, use the `--help` command-line option.
For example:
```
python3 main.py --help
```
### Getting the data
The Transformer-XL model was trained on the
[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
dataset. The WikiText-103 dataset is a collection of over 100 million tokens
extracted from the set of verified
[Good](https://en.wikipedia.org/wiki/Wikipedia:Good_articles) and
[Featured](https://en.wikipedia.org/wiki/Wikipedia:Featured_articles) articles
on Wikipedia.
This repository contains the `getdata.sh` download script which
automatically downloads and extracts the training, validation and test
datasets. By default, data is downloaded to the `data` directory.
In order to test with other datasets, the script needs to be customized
accordingly.
#### Dataset guidelines
The WikiText-103 dataset was already pre-tokenized with word-level tokens. The
dataset features a large vocabulary of 267,735 tokens and retains the original
case, punctuation and numbers.
The `getdata.sh` script downloads the data, extracts the archive and renames
the training, validation, and test set to `train.txt`, `valid.txt`, `test.txt`
respectively.
#### Multi-dataset
Using other datasets requires changes in the `tf/data_utils.py` file:
* the name of the new dataset should be added to the `dataset` flag
* the support for the new dataset needs to be added to the `Corpus` class:
names of files containing training, validation and test data, options for
the tokenizer, dataset iterator and desired values of cutoffs for adaptive softmax
The current codebase supports training with a word-level vocabulary
(automatically generated based on the provided dataset).
Additionally, using other datasets may require changes in some hyperparameters
(for example, batch size, learning rate, number of training steps,
and the configuration of learning rate scheduler).
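As a rough sketch of what the `Corpus` additions boil down to for a new word-level
dataset (placeholder names; the real branching logic is in `Corpus.__init__` and
`convert_to_tfrecords` in `tf/data_utils.py`):
```
import os
from vocabulary import Vocab   # shipped in tf/vocabulary.py

def build_new_corpus(path):
    # Count tokens, build the vocabulary, then encode each split.
    vocab = Vocab(special=["<eos>"], lower_case=False)
    for split in ("train", "valid", "test"):
        vocab.count_file(os.path.join(path, split + ".txt"))
    vocab.build_vocab()
    encoded = {split: vocab.encode_file(os.path.join(path, split + ".txt"), ordered=True)
               for split in ("train", "valid", "test")}
    cutoffs = []   # adaptive-softmax cutoffs, e.g. [0, 19997, 39997, 199997, len(vocab)] for WT103
    return vocab, encoded, cutoffs
```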
### Training process
The default training configuration can be launched by running the
`run_wt103_base.sh` script with the first argument
set to `train`. By default, the training results are saved to the `tf/LM-TFM` directory,
which maps to your container's `/workspace/transformer-x/tf/LM-TFM` directory;
this can be customized by setting the `--model_dir` parameter.
The training script launches a single-node data-parallel training with a fixed
global batch size of 256, optionally with gradient accumulation to allow
training on configurations with less than 16 GPUs.
**Command-line**
You can launch training of the Transformer-XL base model on the
WikiText-103 dataset with the word-based vocabulary and adaptive softmax using
`<#GPUs>` GPUs. For example:
```
bash run_wt103_base.sh train <#GPUs> [--fp16] [--batch_chunk CHUNK]
```
The `--fp16` flag is optional; if it is set, the script
launches mixed precision training with Tensor Cores; if the flag is not
present, the script launches FP32 training.
The `--batch_chunk CHUNK` parameter controls gradient accumulation. With
gradient accumulation, the batch size is split into `CHUNK` chunks of equal
size, the training script executes the forward and backward pass using each
chunk and then executes the optimizer using accumulated gradients.
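A minimal TF1-style sketch of this accumulation scheme (the helper below is illustrative;
the actual implementation is part of `tf/main.py`):
```
import tensorflow as tf

def accumulation_ops(chunk_loss, optimizer, num_chunks):
    # Accumulate averaged gradients over num_chunks forward/backward passes,
    # apply them in a single optimizer step, then reset the buffers.
    tvars = tf.trainable_variables()
    buffers = [tf.Variable(tf.zeros_like(v), trainable=False) for v in tvars]
    grads = tf.gradients(chunk_loss, tvars)
    accumulate = [b.assign_add(g / num_chunks)
                  for b, g in zip(buffers, grads) if g is not None]
    apply_step = optimizer.apply_gradients(zip(buffers, tvars))
    reset = [b.assign(tf.zeros_like(b)) for b in buffers]
    return accumulate, apply_step, reset
```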
**Examples**
You can launch mixed precision training of the Transformer-XL base model on the
WikiText-103 dataset using 16 GPUs. For example:
```
bash run_wt103_base.sh train 16 --fp16 --batch_chunk 1
```
The batch size per GPU is equal to the default global batch size of 256
divided by the product of the number of GPUs and the number of chunks. In this
case, the batch size per GPU is equal to `256 / (16 * 1) = 16`.
You can launch FP32 training using 8 GPUs; the batch size per GPU is equal to 16
(`--batch_chunk` was set to `2` because a local batch size of 32 runs out
of memory on a DGX-1 with Tesla V100 16G in FP32 training). For example:
```
bash run_wt103_base.sh train 8 --batch_chunk 2
```
A summary of the training progress is printed after every 100 training
iterations; this can be customized by setting the `--log_interval` parameter.
The summary is printed in the following format:
```
step 1300 | lr 0.009998686 | loss 5.09 | pplx 162.70, bpc 7.3461, tok/s 138037
```
which contains information about a current training
step, current learning rate, current training loss,
training [perplexity](https://en.wikipedia.org/wiki/Perplexity#Perplexity_per_word),
bits per character and throughput in tokens per second.
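The `pplx` and `bpc` columns are direct transformations of the reported (natural-log)
loss, which you can verify from the log line above up to rounding of the printed loss:
```
import math

loss = 5.09
print(math.exp(loss))       # perplexity, ~162
print(loss / math.log(2))   # the same loss expressed in bits, ~7.34
```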
The script saves one checkpoint, `model.ckpt`, which contains the last saved model.
By default, model saving is executed every
5000 training steps; this can be customized by setting the `--save_steps`
parameter.
Evaluation (inference) benefits from longer attention sequences, therefore to
reproduce perplexity values reported in the [Transformer-XL
paper](https://arxiv.org/abs/1901.02860), it's necessary to run the final
evaluation with a dedicated inference script. Refer to the [Inference
process](#inference-process) section for more details.
### Inference process
Inference can be run by launching the `run_wt103_base.sh` script
with the first argument set to `eval`. Running
inference requires a pre-trained model checkpoint.
The script supports only single-GPU inference.
**Command-line**
You can launch inference of the Transformer-XL base model on the
WikiText-103 dataset with the word-based vocabulary and adaptive softmax.
For example:
```
bash run_wt103_base.sh eval --model_dir <PATH TO THE CHECKPOINT> [--fp16]
```
The `--fp16` flag is optional; if it is specified, the script
launches inference with Tensor Cores; if the flag is not present, the
script launches FP32 inference.
**Examples**
To launch mixed precision inference on a single GPU using a checkpoint
loaded from `LM-TFM/model.ckpt*`, run:
```
bash run_wt103_base.sh eval --model_dir LM-TFM --fp16
```
To launch FP32 inference on a single GPU using a checkpoint loaded
from `LM-TFM/model.ckpt*`, run:
```
bash run_wt103_base.sh eval --model_dir LM-TFM
```
After the execution, the script prints a summary in the following format:
```
I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16
INFO:tensorflow:| loss 3.15 | pplx 23.32, bpc 4.5432, tok/s 9946, ms/batch 102.84
```
which contains information about loss, perplexity and execution performance on the test dataset.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model
performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance for a specific global batch size `<BS>`,
a specific number of GPUs `<#GPUs>`, and a specific number of training
iterations `<ITER>`, run:
For the base model:
```
bash run_wt103_base.sh train <#GPUs> --train_batch_size <BS> --train_steps <ITER> --log_interval 1 [--fp16] [--batch_chunk CHUNK]
```
It's recommended to launch at least 1500 training steps to get a reliable
estimate of training performance. For more information about the available
options, refer to the [Training process](#training-process) section.
The training script prints information in the following format:
```
(...)
[1,0]<stderr>:INFO:tensorflow:step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60, bpc 13.2977, tok/s 136092
[1,0]<stderr>:I0109 12:18:41.333325 140403024426816 main.py:333] step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60,
bpc 13.2977, tok/s 136092
[1,0]<stderr>:INFO:tensorflow:step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87, bpc 13.2851, tok/s 135309
[1,0]<stderr>:I0109 12:18:41.696926 140403024426816 main.py:333] step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87,
bpc 13.2851, tok/s 135309
(...)
[1,0]<stderr>:INFO:tensorflow:Training throughput: 135959 tok/s
```
The last two lines contain information on the
average training throughput measured in tokens per second.
#### Inference performance benchmark
The inference performance and accuracy benchmarks require a checkpoint from a
trained model.
To benchmark the inference performance on a specific global batch size `<BS>`, run:
```
bash run_wt103_base.sh eval --model_dir <CHECKPOINT_DIR> --eval_batch_size <BS> [--fp16]
```
The inference script prints information in the following format:
```
I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16
INFO:tensorflow:| loss 3.15 | pplx 23.32, bpc 4.5432, tok/s 9946, ms/batch 102.84
```
The output contains information on the achieved test loss and test perplexity,
the average inference throughput (measured in tokens per second), and the average
inference latency (measured in milliseconds).
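The latency percentiles shown in the inference tables in the [Results](#results) section
correspond to the `--percentiles` flag (default `90,95,99`); given a list of per-batch
latencies they are computed in the usual way, for example:
```
import numpy as np

# Hypothetical per-batch latencies in milliseconds collected during a benchmark run.
latencies_ms = np.array([101.8, 102.4, 102.9, 103.5, 104.1, 110.2])
print(latencies_ms.mean())                          # Latency Avg
print(np.percentile(latencies_ms, [90, 95, 99]))    # Latency 90% / 95% / 99%
```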
### Results
The following sections provide details on how we achieved our performance and
accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
###### Base model
Our results were obtained by running the `tf/run_wt103_base.sh`
training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1
with 8x V100 16G GPUs.
|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:|
| 1 | 16 | 23.64 | 23.58 | 2943 | 2011 | 1.46 |
| 8 | 16 | 23.36 | 23.38 | 439 | 333 | 1.32 |
##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
###### Base model
Our results were obtained by running the `tf/run_wt103_base.sh`
training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2
with 16x V100 32G GPUs.
|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:|
| 16 | 16 | 23.39 | 23.37 | 202 | 161 | 1.25 |
| 8 | 32 | 23.33 | 23.40 | 330 | 227 | 1.46 |
##### Training loss plot
###### Base model
![TrainingLossBase](tf/img/training_loss_base.png)
##### Training stability test
###### Base model
The Transformer-XL base model was trained for 40,000 training steps, starting
from 20 different initial random seeds. The training was performed in the tensorflow:19.12-tf1-py3 NGC container on
NVIDIA DGX-1 with 8x V100 16G GPUs.
After training, the models were evaluated on the test dataset. The following
table summarizes the final perplexity on the test set.
|**Average perplexity**|**Standard deviation**|**Minimum**|**Maximum**|**Median**|
|---------------------:|---------------------:|----------:|----------:|---------:|
| 23.39 | 0.0878 | 23.24 | 23.58 | 23.39 |
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
###### Base model
Our results were obtained by running the `tf/run_wt103_base.sh`
training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 8x
V100 16G GPUs. Performance numbers (in tokens per second) were averaged over 2000
training iterations.
|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 16 | 9,104 | 13,004 | 1.428 | 1.000 | 1.000 |
| 2 | 16 | 18,169 | 23,856 | 1.313 | 1.996 | 1.835 |
| 4 | 16 | 38,876 | 50,310 | 1.294 | 4.270 | 3.869 |
| 8 | 16 | 78,626 | 101,954 | 1.297 | 8.636 | 7.840 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
###### Base model
Our results were obtained by running the `tf/run_wt103_base.sh` training
script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G
GPUs. Performance numbers (in tokens per second) were averaged over 2000
training iterations.
|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 16 | 9,891 | 13,791 | 1.394 | 1.000 | 1.000 |
| 2 | 16 | 21,550 | 28,306 | 1.314 | 2.179 | 2.052 |
| 4 | 16 | 42,616 | 55,430 | 1.301 | 4.309 | 4.019 |
| 8 | 16 | 83,932 | 107,999 | 1.287 | 8.486 | 7.831 |
| 16 | 16 | 164,675 | 206,906 | 1.256 | 16.649 | 15.003 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### Base model
Our results were obtained by running the
`tf/scripts/inference_benchmark.sh` inference benchmarking script in the
tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU.
The command to launch the inference performance benchmark is provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| 1 | 64 | 640 | 1394.7 | 45.91 | 47.18 | 47.98 | 49.47 |
| 2 | 64 | 640 | 2560.9 | 50.00 | 51.30 | 52.08 | 54.94 |
| 4 | 64 | 640 | 4326.6 | 59.14 | 60.47 | 61.21 | 63.00 |
| 8 | 64 | 640 | 6621.9 | 77.29 | 78.50 | 79.01 | 81.36 |
| 16 | 64 | 640 | 8872.3 | 115.34 | 116.93 | 117.98 | 121.15 |
| 32 | 64 | 640 | 10441.9 | 196.00 | 197.94 | 199.43 | 203.96 |
**FP32**
|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| 1 | 64 | 640 | 1315.2 | 48.70 | 49.78 | 50.54 | 53.31 |
| 2 | 64 | 640 | 2419.2 | 52.91 | 54.17 | 54.73 | 56.13 |
| 4 | 64 | 640 | 4012.7 | 63.76 | 65.27 | 66.11 | 67.81 |
| 8 | 64 | 640 | 5650.1 | 90.56 | 91.92 | 92.47 | 94.15 |
| 16 | 64 | 640 | 7041.2 | 145.34 | 147.20 | 148.38 | 151.37 |
| 32 | 64 | 640 | 8051.3 | 254.14 | 256.58 | 257.51 | 258.39 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA T4
###### Base model
Our results were obtained by running the
`tf/scripts/inference_benchmark.sh` inference benchmarking script in the
tensorflow:19.12-tf1-py3 NGC container on NVIDIA T4.
The command to launch the inference performance benchmark is provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| 1 | 64 | 640 | 1053.6 | 60.75 | 61.59 | 62.02 | 63.58 |
| 2 | 64 | 640 | 2024.5 | 63.22 | 63.95 | 64.76 | 67.33 |
| 4 | 64 | 640 | 3309.7 | 77.30 | 78.33 | 78.85 | 80.12 |
| 8 | 64 | 640 | 4713.7 | 108.53 | 109.66 | 110.26 | 111.15 |
| 16 | 64 | 640 | 6075.8 | 168.40 | 169.62 | 170.28 | 171.88 |
| 32 | 64 | 640 | 6850.5 | 298.69 | 300.42 | 301.04 | 302.21 |
**FP32**
|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| 1 | 64 | 640 | 929.5 | 68.88 | 70.43 | 70.88 | 72.05 |
| 2 | 64 | 640 | 1757.6 | 72.84 | 74.30 | 75.08 | 76.62 |
| 4 | 64 | 640 | 2696.7 | 94.87 | 97.02 | 97.58 | 99.19 |
| 8 | 64 | 640 | 3561.6 | 143.65 | 145.98 | 146.96 | 148.18 |
| 16 | 64 | 640 | 4190.4 | 244.16 | 246.34 | 246.62 | 247.32 |
| 32 | 64 | 640 | 4567.7 | 447.96 | 451.19 | 452.77 | 455.32 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
### Changelog
* April 2020
* Initial release
* Support for FP32 and mixed precision training on NVIDIA
DGX-1, NVIDIA DGX-2, and inference on NVIDIA Tesla V100 16G
and NVIDIA T4
### Known issues
There are no known issues with this model.

View file

@ -0,0 +1,120 @@
# BSD 3-Clause License
#
# Copyright (c) 2017,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
echo "=== Acquiring datasets ==="
echo "---"
mkdir -p data
cd data
if [[ ! -d 'wikitext-2' ]]; then
echo "- Downloading WikiText-2 (WT2)"
wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
unzip -q wikitext-2-v1.zip
cd wikitext-2
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd ..
fi
echo "- Downloading WikiText-103 (WT2)"
if [[ ! -d 'wikitext-103' ]]; then
wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
unzip -q wikitext-103-v1.zip
cd wikitext-103
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd ..
fi
echo "- Downloading enwik8 (Character)"
if [[ ! -d 'enwik8' ]]; then
mkdir -p enwik8
cd enwik8
wget --continue http://mattmahoney.net/dc/enwik8.zip
wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
python3 prep_enwik8.py
cd ..
fi
echo "- Downloading text8 (Character)"
if [[ ! -d 'text8' ]]; then
mkdir -p text8
cd text8
wget --continue http://mattmahoney.net/dc/text8.zip
python ../../prep_text8.py
cd ..
fi
echo "- Downloading Penn Treebank (PTB)"
if [[ ! -d 'penn' ]]; then
wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzf simple-examples.tgz
mkdir -p penn
cd penn
mv ../simple-examples/data/ptb.train.txt train.txt
mv ../simple-examples/data/ptb.test.txt test.txt
mv ../simple-examples/data/ptb.valid.txt valid.txt
cd ..
echo "- Downloading Penn Treebank (Character)"
mkdir -p pennchar
cd pennchar
mv ../simple-examples/data/ptb.char.train.txt train.txt
mv ../simple-examples/data/ptb.char.test.txt test.txt
mv ../simple-examples/data/ptb.char.valid.txt valid.txt
cd ..
rm -rf simple-examples/
fi
echo "- Downloading 1B words"
if [[ ! -d 'one-billion-words' ]]; then
mkdir -p one-billion-words
cd one-billion-words
wget --no-proxy http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
path="1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
cat ${path}/news.en.heldout-00000-of-00050 > valid.txt
cat ${path}/news.en.heldout-00000-of-00050 > test.txt
wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
cd ..
fi
echo "---"
echo "Happy language modeling :)"

View file

@ -0,0 +1,62 @@
#!/usr/bin/env python
# coding=utf-8
# BSD 3-Clause License
#
# Copyright (c) 2017,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import sys
import zipfile
from io import open
if os.path.exists('train.txt'):
print('Tokenized text8 already exists - skipping processing')
sys.exit()
data = zipfile.ZipFile('text8.zip').extractall()
data = open('text8', 'r', encoding='utf-8').read()
print('Length of text8: {}'.format(len(data)))
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
valid_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]
for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]:
print('{} will have {} bytes'.format(fn, len(part)))
print('- Tokenizing...')
# Change space ' ' to underscore '_'
part_str = ' '.join(['_' if c == ' ' else c for c in part.strip()])
print('- Writing...')
f = open(fn, 'w').write(part_str)
f = open(fn + '.raw', 'w', encoding='utf-8').write(part)

View file

@ -0,0 +1,488 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
from functools import partial
from collections import Counter, OrderedDict
import pickle
import json
import multiprocessing as mp
import numpy as np
from absl import flags
import tensorflow as tf
from vocabulary import Vocab
from tensorflow.gfile import Exists as exists
from tensorflow.gfile import MakeDirs as makedirs
from tensorflow.gfile import Glob as glob
def _preprocess(shard, train, vocab, save_dir, cutoffs, bin_sizes, bsz, tgt_len,
num_core_per_host, num_shuffle):
file_names = []
num_batch = 0
path = train[shard]
data_shard = vocab.encode_file(path, ordered=False, add_double_eos=True)
for shuffle in range(num_shuffle):
basename = "train-{:03d}-{:02d}".format(shard, shuffle)
print("Processing shard {} shuffle {}".format(shard, shuffle))
np.random.shuffle(data_shard)
file_name, num_batch_shuffle = create_ordered_tfrecords(
save_dir, basename, np.concatenate(data_shard), bsz, tgt_len,
num_core_per_host, cutoffs, bin_sizes)
file_names.append(file_name)
num_batch += num_batch_shuffle
return file_names, num_batch
class Corpus(object):
def __init__(self, path, dataset, *args, **kwargs):
self.dataset = dataset
self.vocab = Vocab(*args, **kwargs)
if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
self.vocab.count_file(os.path.join(path, "train.txt"))
self.vocab.count_file(os.path.join(path, "valid.txt"))
self.vocab.count_file(os.path.join(path, "test.txt"))
elif self.dataset == "wt103":
self.vocab.count_file(os.path.join(path, "train.txt"))
elif self.dataset == "lm1b":
train_path_pattern = os.path.join(
path, "1-billion-word-language-modeling-benchmark-r13output",
"training-monolingual.tokenized.shuffled", "news.en-*")
train_paths = glob(train_path_pattern)
# the vocab will load from file when build_vocab() is called
# for train_path in sorted(train_paths):
# self.vocab.count_file(train_path, verbose=True)
self.vocab.build_vocab()
if self.dataset in ["ptb", "wt2", "wt103"]:
self.train = self.vocab.encode_file(
os.path.join(path, "train.txt"), ordered=True)
self.valid = self.vocab.encode_file(
os.path.join(path, "valid.txt"), ordered=True)
self.test = self.vocab.encode_file(
os.path.join(path, "test.txt"), ordered=True)
elif self.dataset in ["enwik8", "text8"]:
self.train = self.vocab.encode_file(
os.path.join(path, "train.txt"), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(
os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(
os.path.join(path, "test.txt"), ordered=True, add_eos=False)
elif self.dataset == "lm1b":
self.train = train_paths
valid_path = os.path.join(path, "valid.txt")
test_path = valid_path
self.valid = self.vocab.encode_file(
valid_path, ordered=True, add_double_eos=True)
self.test = self.vocab.encode_file(
test_path, ordered=True, add_double_eos=True)
if self.dataset == "wt103":
self.cutoffs = [0, 19997, 39997, 199997] + [len(self.vocab)]
elif self.dataset == "lm1b":
self.cutoffs = [0, 59997, 99997, 639997] + [len(self.vocab)]
else:
self.cutoffs = []
def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
num_core_per_host, **kwargs):
FLAGS = kwargs.get('FLAGS')
file_names = []
record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
split, bsz, tgt_len)
record_info_path = os.path.join(save_dir, record_name)
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data = getattr(self, split)
bin_sizes = get_bin_sizes(
data, bsz // num_core_per_host, tgt_len, self.cutoffs)
file_name, num_batch = create_ordered_tfrecords(
save_dir, split, data, bsz, tgt_len, num_core_per_host,
self.cutoffs, bin_sizes,
num_passes=FLAGS.num_passes if split == 'train' else 1)
file_names.append(file_name)
elif self.dataset == "lm1b":
bin_sizes = get_bin_sizes(
self.valid, bsz // num_core_per_host, tgt_len, self.cutoffs)
if split == "train":
np.random.seed(123456)
num_batch = 0
if FLAGS.num_procs > 1:
_preprocess_wrapper = partial(_preprocess,
train=self.train, vocab=self.vocab, save_dir=save_dir,
cutoffs=self.cutoffs, bin_sizes=bin_sizes, bsz=bsz,
tgt_len=tgt_len, num_core_per_host=num_core_per_host,
num_shuffle=FLAGS.num_shuffle)
pool = mp.Pool(processes=FLAGS.num_procs)
results = pool.map(_preprocess_wrapper, range(len(self.train)))
for res in results:
file_names.extend(res[0])
num_batch += res[1]
else:
for shard, path in enumerate(self.train):
data_shard = self.vocab.encode_file(path, ordered=False,
add_double_eos=True)
num_shuffle = FLAGS.num_shuffle
for shuffle in range(num_shuffle):
print("Processing shard {} shuffle {}".format(shard, shuffle))
basename = "train-{:03d}-{:02d}".format(shard, shuffle)
np.random.shuffle(data_shard)
file_name, num_batch_ = create_ordered_tfrecords(
save_dir, basename, np.concatenate(data_shard), bsz, tgt_len,
num_core_per_host,
self.cutoffs, bin_sizes)
file_names.append(file_name)
num_batch += num_batch_
else:
file_name, num_batch = create_ordered_tfrecords(
save_dir, split, getattr(self, split), bsz, tgt_len,
num_core_per_host,
self.cutoffs, bin_sizes)
file_names.append(file_name)
with open(record_info_path, "w") as fp:
record_info = {
"filenames": file_names,
"bin_sizes": bin_sizes,
"num_batch": num_batch
}
json.dump(record_info, fp)
def get_bin_sizes(data, batch_size, tgt_len, cutoffs, std_mult=[2.5, 2.5, 2.5]):
"""
Note: the `batch_size` here should be per-core batch size
"""
bin_sizes = []
def _nearest_to_eight(x):
y = x - x % 8
return y + 8 if x % 8 >= 4 else max(8, y)
if cutoffs:
num_batch = len(data) // batch_size // tgt_len
data = data[:batch_size * num_batch * tgt_len]
data = data.reshape(batch_size, num_batch, tgt_len)
tot = batch_size * tgt_len
for b, (left, right) in enumerate(zip(cutoffs[1:-1], cutoffs[2:])):
mask = (data >= left) * (data < right)
percents = mask.astype(np.float64).sum(2).sum(0) / tot
mean = np.mean(percents)
std = np.std(percents)
bin_size = int(math.ceil(tgt_len * batch_size * (mean + std_mult[b] * std)))
bin_size = _nearest_to_eight(bin_size)
bin_sizes.append(bin_size)
return bin_sizes
def _int64_feature(values):
return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
def _float_feature(values):
return tf.train.Feature(float_list=tf.train.FloatList(value=values))
def batchify(data, batch_size, num_passes):
"""
if num_passes > 1
Here, we use multiple randomly shifted copies.
"""
if num_passes > 1:
data_len = len(data)
double_data = np.concatenate([data, data])
data_list = []
for i in range(num_passes):
start = np.random.randint(0, data_len)
data_list.append(double_data[start:start+data_len])
data = np.concatenate(data_list)
num_step = len(data) // batch_size
data = data[:batch_size * num_step]
data = data.reshape(batch_size, num_step)
return data
def create_ordered_tfrecords(save_dir, basename, data, batch_size, tgt_len,
num_core_per_host, cutoffs=[], bin_sizes=[],
num_passes=1):
file_name = "{}.bsz-{}.tlen-{}.tfrecords".format(
basename, batch_size, tgt_len)
save_path = os.path.join(save_dir, file_name)
record_writer = tf.python_io.TFRecordWriter(save_path)
batched_data = batchify(data, batch_size, num_passes)
num_batch = 0
for t in range(0, batched_data.shape[1] - 1, tgt_len):
cur_tgt_len = min(batched_data.shape[1] - 1 - t, tgt_len)
if num_batch % 500 == 0:
print(" processing batch {}".format(num_batch))
for idx in range(batch_size):
inputs = batched_data[idx, t:t + cur_tgt_len]
labels = batched_data[idx, t + 1:t + cur_tgt_len + 1]
# features dict
feature = {
"inputs": _int64_feature(inputs),
"labels": _int64_feature(labels),
}
example = tf.train.Example(features=tf.train.Features(feature=feature))
record_writer.write(example.SerializeToString())
num_batch += 1
record_writer.close()
print("Done writing {}. batches: {}".format(file_name, num_batch))
return file_name, num_batch
def get_lm_corpus(data_dir, dataset):
fn = os.path.join(data_dir, "cache.pkl")
if exists(fn):
print("Loading cached dataset...")
with open(fn, "rb") as fp:
corpus = pickle.load(fp)
else:
print("Producing dataset...")
kwargs = {}
if dataset in ["wt103", "wt2"]:
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = False
elif dataset == "ptb":
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = True
elif dataset == "lm1b":
kwargs["special"] = []
kwargs["lower_case"] = False
kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")
elif dataset in ["enwik8", "text8"]:
pass
corpus = Corpus(data_dir, dataset, **kwargs)
print("Saving dataset...")
with open(fn, "wb") as fp:
pickle.dump(corpus, fp, protocol=2)
corpus_info = {
"vocab_size" : len(corpus.vocab),
"cutoffs" : corpus.cutoffs,
"dataset" : corpus.dataset
}
with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
json.dump(corpus_info, fp)
return corpus
def main(unused_argv):
del unused_argv # Unused
corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)
save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
if not exists(save_dir):
makedirs(save_dir)
# test mode
if FLAGS.eval_batch_size > 0:
corpus.convert_to_tfrecords("test", save_dir, FLAGS.eval_batch_size,
FLAGS.tgt_len, FLAGS.num_core_per_host,
FLAGS=FLAGS)
return
for split, batch_size in zip(
["train", "valid"],
[FLAGS.train_batch_size // FLAGS.batch_chunk, FLAGS.valid_batch_size]):
if batch_size <= 0: continue
print("Converting {} set...".format(split))
corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len,
FLAGS.num_core_per_host, FLAGS=FLAGS)
def load_record_info(record_info_dir, split, per_host_bsz, tgt_len,
num_core_per_host):
record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
split, per_host_bsz, tgt_len)
record_info_path = os.path.join(record_info_dir, record_name)
with open(record_info_path, "r") as fp:
record_info = json.load(fp)
return record_info
def get_input_fn(record_info_dir, split, per_host_bsz, tgt_len,
num_core_per_host, num_hosts=1):
"""Creates input function."""
record_info = load_record_info(record_info_dir, split, per_host_bsz, tgt_len,
num_core_per_host)
file_names = record_info["filenames"]
bin_sizes = record_info["bin_sizes"]
num_batch = record_info["num_batch"]
tf.logging.info("[{}] File names {}".format(split, file_names))
def input_fn(params):
# per-core batch size
per_core_bsz = params["batch_size"] // num_core_per_host
# data_dir could be a remote path, e.g., a google storage url
data_dir = params["data_dir"]
def parser(record):
# preprocess "inp_perm" and "tgt_perm"
def _process_perm_feature(example, prefix):
for b in range(len(bin_sizes)):
cnt = example.pop("{}_cnt_{}".format(prefix, b))[0]
tup = example.pop("{}_tup_{}".format(prefix, b))
tup = tf.reshape(
tf.sparse_tensor_to_dense(tup),
shape=[cnt, 2])
# tf.float32
perm = tf.sparse_to_dense(
sparse_indices=tup,
output_shape=[tgt_len, bin_sizes[b]],
sparse_values=1.0,
default_value=0.0)
example["{}_perm_{}".format(prefix, b)] = perm
# whether allow the last batch with a potentially shorter length
record_spec = {
"inputs": tf.VarLenFeature(tf.int64),
"labels": tf.VarLenFeature(tf.int64),
}
# retrieve serialized example
example = tf.parse_single_example(
serialized=record,
features=record_spec)
# cast int64 into int32
# cast sparse to dense
for key in list(example.keys()):
val = example[key]
if tf.keras.backend.is_sparse(val):
val = tf.sparse.to_dense(val)
if val.dtype == tf.int64:
val = tf.to_int32(val)
example[key] = val
return example["inputs"], example["labels"]
file_paths = []
for file_name in file_names:
file_path = os.path.join(data_dir, file_name)
file_paths.append(file_path)
if split == "train":
dataset = tf.data.Dataset.from_tensor_slices(file_paths)
if len(file_paths) > 1:
dataset = dataset.shuffle(len(file_paths)).repeat()
dataset = tf.data.TFRecordDataset(dataset)
elif num_hosts > 1:
host_id = params["context"].current_host
# drop the remaining batches
num_batch_per_host = num_batch // num_hosts
my_start_sample_id = (host_id * num_batch_per_host * num_core_per_host *
per_core_bsz)
my_sample_num = num_batch_per_host * num_core_per_host * per_core_bsz
dataset = tf.data.TFRecordDataset(dataset).skip(
my_start_sample_id).take(my_sample_num)
else:
dataset = tf.data.TFRecordDataset(dataset)
if num_core_per_host > 1:
import horovod.tensorflow as hvd
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.map(parser).cache().repeat()
dataset = dataset.batch(per_core_bsz, drop_remainder=True)
dataset = dataset.prefetch(num_core_per_host * per_core_bsz)
else:
# do not shuffle, repeat or cache in evaluation
dataset = tf.data.Dataset.from_tensor_slices(file_paths)
dataset = tf.data.TFRecordDataset(dataset)
dataset = dataset.map(parser)
dataset = dataset.batch(per_core_bsz, drop_remainder=True)
return dataset
if split == "train" and num_hosts > 1:
record_info["num_batch"] = num_batch // num_hosts
return input_fn, record_info
def get_corpus_info(corpus_info_path):
with open(corpus_info_path, "r") as fp:
corpus_info = json.load(fp)
return corpus_info
if __name__ == "__main__":
FLAGS = flags.FLAGS
flags.DEFINE_string("data_dir", None,
help="Location of the data corpus")
flags.DEFINE_enum("dataset", "wt103",
["ptb", "wt2", "wt103", "lm1b", "enwik8", "text8"],
help="Dataset name.")
flags.DEFINE_integer("train_batch_size", 256,
help="train batch size each host")
flags.DEFINE_integer("valid_batch_size", 256,
help="valid batch size each host")
flags.DEFINE_integer("eval_batch_size", 16,
help="If > 0, enter test mode and process test set only."
"Otherwise, process train and dev sets only.")
flags.DEFINE_integer("tgt_len", 70,
help="number of tokens to predict")
flags.DEFINE_integer("max_batch", -1,
help="run in debug mode")
flags.DEFINE_integer("num_core_per_host", 8,
help="number of GPUs per host")
flags.DEFINE_bool("debug", default=False,
help="Process only the first batch without shuffle for lm1b.")
flags.DEFINE_integer("num_procs", 1,
help="number of processes")
flags.DEFINE_integer("num_passes", 10,
help="number of passes")
flags.DEFINE_integer("num_shuffle", 4,
help="number of shuffles for lm1b")
flags.DEFINE_integer("batch_chunk", 1,
help="number of accumulation steps")
tf.app.run(main)

View file

@ -0,0 +1,56 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dllogger
import os
class AverageMeter:
"""
Computes and stores the average and current value
"""
def __init__(self, warmup=0, keep=False):
self.reset()
self.warmup = warmup
self.keep = keep
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
self.iters = 0
self.vals = []
def update(self, val, n=1):
self.iters += 1
self.val = val
if self.iters > self.warmup:
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
if self.keep:
self.vals.append(val)
def setup_dllogger(enabled=True, filename=os.devnull, rank=0):
if enabled and rank == 0:
backends = [
dllogger.JSONStreamBackend(
dllogger.Verbosity.VERBOSE,
filename,
),
]
dllogger.init(backends)
else:
dllogger.init([])

Binary file not shown. (image, 694 KiB)

Binary file not shown. (image, 327 KiB)
View file

@ -0,0 +1,179 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# MIT License
#
# Copyright (c) 2019 cybertronai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
class LAMBOptimizer(optimizer.Optimizer):
def __init__(self, learning_rate=0.001, wd= 0.01, beta1=0.9, beta2=0.999, epsilon=1e-6,
use_locking=False, name="LAMB"):
super(LAMBOptimizer, self).__init__(use_locking, name)
self._lr = learning_rate
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._wd = wd
# Tensor versions of the constructor arguments, created in _prepare().
self._lr_t = None
self._beta1_t = None
self._beta2_t = None
self._epsilon_t = None
self._wd_t = None
def _get_beta_accumulators(self):
with ops.init_scope():
if context.executing_eagerly():
graph = None
else:
graph = ops.get_default_graph()
return (self._get_non_slot_variable("beta1_power", graph=graph),
self._get_non_slot_variable("beta2_power", graph=graph))
def _create_slots(self, var_list):
first_var = min(var_list, key=lambda x: x.name)
self._create_non_slot_variable(initial_value=self._beta1,
name="beta1_power",
colocate_with=first_var)
self._create_non_slot_variable(initial_value=self._beta2,
name="beta2_power",
colocate_with=first_var)
for v in var_list:
self._zeros_slot(v, "m", self._name)
self._zeros_slot(v, "v", self._name)
def _prepare(self):
lr = self._call_if_callable(self._lr)
beta1 = self._call_if_callable(self._beta1)
beta2 = self._call_if_callable(self._beta2)
epsilon = self._call_if_callable(self._epsilon)
wd = self._call_if_callable(self._wd)
self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
self._wd_t = ops.convert_to_tensor(wd, name="wd")
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_power, beta2_power = self._get_beta_accumulators()
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
eps = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
wd_lambda = math_ops.cast(self._wd_t, var.dtype.base_dtype)
v = self.get_slot(var, "v")
v_t = v.assign(beta2_t * v + (1. - beta2_t) * grad**2)
m = self.get_slot(var, "m")
m_t = m.assign(beta1_t * m + (1. - beta1_t) * grad)
# LAMB trust ratio: scale the update by min(||w||, 10) / ||step||, falling back to 1.0 if either norm is zero
r1 = tf.sqrt(tf.reduce_sum(tf.square(var)))
step = m_t / (tf.sqrt(v_t) + eps) + wd_lambda * var
r2 = tf.sqrt(tf.reduce_sum(tf.square(step)))
ratio = array_ops.where(math_ops.greater(r1, 0), array_ops.where(
math_ops.greater(r2, 0), tf.minimum(r1, 10) / r2, 1.0), 1.0)
var_update = state_ops.assign_sub(var, lr_t * ratio * step)
return control_flow_ops.group(*[var_update, v_t, m_t])
def _resource_apply_dense(self, grad, var):
return None
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
beta1_power, beta2_power = self._get_beta_accumulators()
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
with ops.control_dependencies([m_t]):
m_t = scatter_add(m, indices, m_scaled_g_values)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
with ops.control_dependencies([v_t]):
v_t = scatter_add(v, indices, v_scaled_g_values)
v_sqrt = math_ops.sqrt(v_t)
step = m_t / (v_sqrt + epsilon_t)
w_norm = linalg_ops.norm(var, ord=2)
g_norm = linalg_ops.norm(step, ord=2)
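# Same LAMB trust ratio as the dense path; note that this sparse path applies no weight decay.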
ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
math_ops.greater(g_norm, 0), tf.minimum(w_norm, 10) / g_norm, 1.0), 1.0)
var_update = state_ops.assign_sub(
var, ratio * lr_t * step, use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
def _apply_sparse(self, grad, var):
return self._apply_sparse_shared(
grad.values,
var,
grad.indices,
lambda x, i, v: state_ops.scatter_add( # pylint: disable=g-long-lambda
x,
i,
v,
use_locking=self._use_locking))
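# Usage sketch (names are illustrative): the optimizer plugs into a standard TF1 loop, e.g.
#
#   opt = LAMBOptimizer(learning_rate=0.01, wd=0.01)
#   train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())
#
# main.py below instead calls compute_gradients()/apply_gradients() explicitly so it can
# clip and accumulate gradients before each update.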

View file

@ -0,0 +1,510 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import math
import time
from absl import flags
import absl.logging as _logging # pylint: disable=unused-import
import tensorflow as tf
import horovod.tensorflow as hvd
import model
import data_utils
import lamb
import dllogger
from exp_utils import AverageMeter, setup_dllogger
import numpy as np
flags.DEFINE_integer("num_core_per_host", default=8,
help="Number of cores per host")
flags.DEFINE_bool('horovod', True, 'Use Horovod for multi-GPU training')
# Experiment (data/checkpoint/directory) config
flags.DEFINE_string("raport_file", default="summary.json",
help="Path to dlloger json")
flags.DEFINE_string("data_dir", default="",
help="Path to tf-records directory.")
flags.DEFINE_string("record_info_dir", default="",
help="Path to local directory containing filenames.txt.")
flags.DEFINE_string("corpus_info_path", default="",
help="Path to corpus-info.json file.")
flags.DEFINE_string("model_dir", default="LM-TFM",
help="Estimator model_dir.")
flags.DEFINE_bool("do_train", default=True,
help="Whether to run training.")
flags.DEFINE_bool("do_eval", default=False,
help="Whether to run eval on the dev set.")
flags.DEFINE_string("eval_ckpt_path", None,
help="Checkpoint path for do_test evaluation."
"If set, model_dir will be ignored."
"If unset, will use the latest ckpt in model_dir.")
flags.DEFINE_bool("fp16", default=False,
help="Whether to enable AMP ops.")
flags.DEFINE_bool("jit_optimizer", default=True,
help="Whether to enable XLA on optimizer")
# Optimization config
flags.DEFINE_float("learning_rate", default=0.01,
help="Maximum learning rate.")
flags.DEFINE_float("clip", default=0.25,
help="Gradient clipping value.")
# for cosine decay
flags.DEFINE_float("min_lr_ratio", default=0.1,
help="Minimum ratio learning rate.")
flags.DEFINE_integer("warmup_steps", default=1000,
help="Number of steps for linear lr warmup.")
# Training config
flags.DEFINE_integer("train_batch_size", default=256,
help="Size of train batch.")
flags.DEFINE_integer("eval_batch_size", default=16,
help="Size of valid batch.")
flags.DEFINE_integer("train_steps", default=40000,
help="Total number of training steps.")
flags.DEFINE_integer("log_interval", default=100,
help="Number of iterations per repeat loop.")
flags.DEFINE_integer("save_steps", default=5000,
help="number of steps for model checkpointing.")
flags.DEFINE_integer("batch_chunk", default=1,
help="Number of accumulation steps.")
# Evaluation config
flags.DEFINE_integer("max_eval_batch", default=-1,
help="Set -1 to turn off. Only used in test mode.")
flags.DEFINE_string("eval_split", "valid",
help="Which data split to evaluate.")
flags.DEFINE_list("percentiles", default=['90', '95', '99'],
help="percentiles for latency confidence intervals")
# Model config
flags.DEFINE_integer("tgt_len", default=192,
help="Number of steps to predict")
flags.DEFINE_integer("mem_len", default=192,
help="Number of steps to cache")
flags.DEFINE_bool("same_length", default=False,
help="Same length attention")
flags.DEFINE_integer("clamp_len", default=-1,
help="Clamp length")
flags.DEFINE_integer("n_layer", default=16,
help="Number of layers.")
flags.DEFINE_integer("d_model", default=512,
help="Dimension of the model.")
flags.DEFINE_integer("d_embed", default=512,
help="Dimension of the embeddings.")
flags.DEFINE_integer("n_head", default=8,
help="Number of attention heads.")
flags.DEFINE_integer("d_head", default=64,
help="Dimension of each attention head.")
flags.DEFINE_integer("d_inner", default=2048,
help="Dimension of inner hidden size in positionwise feed-forward.")
flags.DEFINE_float("dropout", default=0.1,
help="Dropout rate.")
flags.DEFINE_float("dropatt", default=0.0,
help="Attention dropout rate.")
flags.DEFINE_bool("untie_r", default=False,
help="untie r_w_bias and r_r_bias")
# Adaptive Softmax / Embedding
flags.DEFINE_bool("tie_weight", default=True,
help="Tie embedding and softmax weight.")
flags.DEFINE_integer("div_val", default=1,
help="Divide the embedding size by this val for each bin")
flags.DEFINE_bool("proj_share_all_but_first", default=False,
help="True to share all but first projs, False not to share.")
flags.DEFINE_bool("proj_same_dim", default=True,
help="Project the bin with the same dimension.")
# Parameter initialization
flags.DEFINE_enum("init", default="normal",
enum_values=["normal", "uniform"],
help="Initialization method.")
flags.DEFINE_float("init_std", default=0.02,
help="Initialization std when init is normal.")
flags.DEFINE_float("proj_init_std", default=0.01,
help="Initialization std for embedding projection.")
flags.DEFINE_float("init_range", default=0.1,
help="Initialization std when init is uniform.")
FLAGS = flags.FLAGS
def get_model_fn(n_token, cutoffs):
def model_fn(inp, tgt, mems, is_training):
inp = tf.transpose(inp, [1, 0])
tgt = tf.transpose(tgt, [1, 0])
if FLAGS.init == "uniform":
initializer = tf.initializers.random_uniform(
minval=-FLAGS.init_range,
maxval=FLAGS.init_range,
seed=None)
elif FLAGS.init == "normal":
initializer = tf.initializers.random_normal(
stddev=FLAGS.init_std,
seed=None)
proj_initializer = tf.initializers.random_normal(
stddev=FLAGS.proj_init_std,
seed=None)
tie_projs = [False for _ in range(len(cutoffs) + 1)]
if FLAGS.proj_share_all_but_first:
for i in range(1, len(tie_projs)):
tie_projs[i] = True
loss, new_mems = model.transformer(
dec_inp=inp,
target=tgt,
mems=mems,
n_token=n_token,
n_layer=FLAGS.n_layer,
d_model=FLAGS.d_model,
d_embed=FLAGS.d_embed,
n_head=FLAGS.n_head,
d_head=FLAGS.d_head,
d_inner=FLAGS.d_inner,
dropout=FLAGS.dropout,
dropatt=FLAGS.dropatt,
initializer=initializer,
proj_initializer=proj_initializer,
is_training=is_training,
mem_len=FLAGS.mem_len,
cutoffs=cutoffs,
div_val=FLAGS.div_val,
tie_projs=tie_projs,
input_perms=None,
target_perms=None,
head_target=None,
same_length=FLAGS.same_length,
clamp_len=FLAGS.clamp_len,
untie_r=FLAGS.untie_r,
proj_same_dim=FLAGS.proj_same_dim)
# number of parameters
num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
tf.logging.info('#params: {}'.format(num_params))
if is_training:
all_vars = tf.trainable_variables()
return loss, new_mems, all_vars
else:
return loss, new_mems
return model_fn
def single_core_graph(n_token, cutoffs, is_training, inp, tgt, mems):
model_fn = get_model_fn(
n_token=n_token,
cutoffs=cutoffs)
model_ret = model_fn(
inp=inp,
tgt=tgt,
mems=mems,
is_training=is_training)
return model_ret
def train(n_token, cutoffs, rank, local_rank, size):
meters = {}
warmup = 2 + 12/size
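# The throughput meter skips its first few measurements (more when running on fewer GPUs)
# so graph construction and warm-up iterations do not distort the reported average.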
meters['train_throughput'] = AverageMeter(warmup=warmup)
train_batch_size = FLAGS.train_batch_size // FLAGS.batch_chunk
##### Get input function and model function
train_input_fn, train_record_info = data_utils.get_input_fn(
record_info_dir=FLAGS.record_info_dir,
split="train",
per_host_bsz=train_batch_size,
tgt_len=FLAGS.tgt_len,
num_core_per_host=FLAGS.num_core_per_host,
num_hosts=1)
tf.logging.info("num of batches {}".format(train_record_info["num_batch"]))
##### Create computational graph
train_set = train_input_fn({
"batch_size": train_batch_size,
"data_dir": FLAGS.data_dir})
inputs, labels = train_set.make_one_shot_iterator().get_next()
per_core_bsz = train_batch_size // FLAGS.num_core_per_host
with tf.variable_scope(tf.get_variable_scope()):
mems = [tf.Variable(tf.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], tf.float32), trainable=False)
for _ in range(FLAGS.n_layer)]
loss, new_mems, all_vars = single_core_graph(
n_token=n_token,
cutoffs=cutoffs,
is_training=True,
inp=inputs,
tgt=labels,
mems=mems)
assign_mems = [mems[i].assign(new_mems[i]) for i in range(FLAGS.n_layer)]
target_tokens = tf.size(labels)
## configure the optimizer
global_step = tf.train.get_or_create_global_step()
# warmup stage: increase the learning rate linearly
if FLAGS.warmup_steps > 0:
warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \
* FLAGS.learning_rate
else:
warmup_lr = 0.0
# decay stage: decay the learning rate using the cosine schedule
decay_lr = tf.train.cosine_decay(
FLAGS.learning_rate,
global_step=global_step-FLAGS.warmup_steps,
decay_steps=FLAGS.train_steps-FLAGS.warmup_steps,
alpha=FLAGS.min_lr_ratio)
# choose warmup or decay
learning_rate = tf.where(global_step < FLAGS.warmup_steps,
warmup_lr, decay_lr)
# get the train op
optimizer = lamb.LAMBOptimizer(learning_rate=learning_rate)
if FLAGS.horovod:
optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
grads_and_vars = optimizer.compute_gradients(loss/FLAGS.batch_chunk, all_vars)
grads, all_vars = zip(*grads_and_vars)
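# Gradient accumulation over FLAGS.batch_chunk micro-batches: `in_progress` marks whether an
# accumulation cycle is active; the first chunk overwrites accum_vars, subsequent chunks add to
# them, and acc_final_op applies the clipped, summed gradients and then clears the flag.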
accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in all_vars]
in_progress = tf.get_variable(name="in_progress", shape=[], dtype=tf.bool, trainable=False,
initializer=tf.zeros_initializer)
accum_ops = tf.cond(in_progress,
lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(grads)],
lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(grads)])
with tf.control_dependencies(accum_ops + assign_mems):
acc_op = in_progress.assign(tf.ones_like(in_progress))
final_accum_vars = [accum_vars[i] + gv for i,gv in enumerate(grads)]
acc_clipped, acc_gnorm = tf.clip_by_global_norm(final_accum_vars, FLAGS.clip)
clipped, gnorm = tf.clip_by_global_norm(grads, FLAGS.clip)
acc_train_op = optimizer.apply_gradients(list(zip(acc_clipped, all_vars)), global_step)
grads_and_vars = list(zip(clipped, all_vars))
if FLAGS.jit_optimizer:
jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
with jit_scope():
train_op = optimizer.apply_gradients(grads_and_vars, global_step)
else:
train_op = optimizer.apply_gradients(grads_and_vars, global_step)
final_op = tf.group(train_op, assign_mems)
acc_final_op = tf.group(acc_train_op, assign_mems, in_progress.assign(tf.zeros_like(in_progress)))
##### Training loop
saver = tf.train.Saver()
gpu_options = tf.GPUOptions(allow_growth = True, visible_device_list = str(local_rank))
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options = gpu_options)) as sess:
sess.run(tf.global_variables_initializer())
if FLAGS.horovod:
sess.run(hvd.broadcast_global_variables(0))
accum = [acc_op, target_tokens]
fetches = [loss, global_step, target_tokens, learning_rate, final_op if FLAGS.batch_chunk == 1 else acc_final_op]
total_loss, prev_step, target_tokens = 0., -1, 0
start_time = time.time()
while True:
for i in range(FLAGS.batch_chunk-1):
_,tt = sess.run(accum)
target_tokens += tt
fetched = sess.run(fetches)
loss_np, curr_step, tt = fetched[:3]
total_loss += loss_np
target_tokens += tt
if curr_step > 0 and curr_step % FLAGS.log_interval == 0:
curr_loss = total_loss / (curr_step - prev_step)
throughput = target_tokens * size / (time.time()-start_time)
meters['train_throughput'].update(throughput)
if rank == 0:
tf.logging.info("step {} | lr {:8.9f} "
"| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.0f}".format(
curr_step, fetched[-2],
curr_loss, math.exp(curr_loss), curr_loss / math.log(2), throughput))
dllogger_data = {
'lr': fetched[-2],  # fetched[-2] is the learning rate; fetched[-1] is the train-op result
'train_loss': curr_loss,
'train_perplexity': math.exp(curr_loss),
'train_throughput': throughput,
}
dllogger.log(step=int(curr_step), data=dllogger_data)
total_loss, prev_step, target_tokens = 0., curr_step, 0
start_time = time.time()
if curr_step > 0 and curr_step % FLAGS.save_steps == 0 and rank == 0:
save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
saver.save(sess, save_path)
tf.logging.info("Model saved in path: {}".format(save_path))
if curr_step == FLAGS.train_steps:
break
if rank == 0:
tf.logging.info("Training throughput: {:>6.0f} tok/s".format(meters['train_throughput'].avg))
summary = {
'train_throughput': meters['train_throughput'].avg,
}
dllogger.log(step=tuple(), data=summary)
def evaluate(n_token, cutoffs):
##### Get input function and model function
eval_input_fn, eval_record_info = data_utils.get_input_fn(
record_info_dir=FLAGS.record_info_dir,
split=FLAGS.eval_split,
per_host_bsz=FLAGS.eval_batch_size,
tgt_len=FLAGS.tgt_len,
num_core_per_host=FLAGS.num_core_per_host,
num_hosts=1)
meters = {}
warmup = 2
meters['eval_throughput'] = AverageMeter(warmup=warmup)
meters['eval_latency'] = AverageMeter(warmup=warmup, keep=True)
num_batch = eval_record_info["num_batch"]
if FLAGS.max_eval_batch > 0:
num_batch = FLAGS.max_eval_batch
tf.logging.info("num of batches {}".format(num_batch))
##### Create computational graph
eval_set = eval_input_fn({
"batch_size": FLAGS.eval_batch_size,
"data_dir": FLAGS.data_dir})
inputs, labels = eval_set.make_one_shot_iterator().get_next()
bsz = FLAGS.eval_batch_size
with tf.variable_scope(tf.get_variable_scope()):
mems = [tf.placeholder(tf.float32,
[FLAGS.mem_len, bsz, FLAGS.d_model])
for _ in range(FLAGS.n_layer)]
loss, new_mems = single_core_graph(
n_token=n_token,
cutoffs=cutoffs,
is_training=False,
inp=inputs,
tgt=labels,
mems=mems)
target_tokens = tf.size(labels)
##### Evaluation loop
mems_np = [np.zeros([FLAGS.mem_len, bsz, FLAGS.d_model], dtype=np.float32)
for layer in range(FLAGS.n_layer)]
saver = tf.train.Saver()
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
sess.run(tf.global_variables_initializer())
if FLAGS.eval_ckpt_path is None:
eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
else:
eval_ckpt_path = FLAGS.eval_ckpt_path
tf.logging.info("Evaluate {}".format(eval_ckpt_path))
saver.restore(sess, eval_ckpt_path)
fetches = [loss, new_mems, target_tokens]
format_str = " >> processing batch {{:{0}d}}/{{:{0}d}}".format(
len(str(num_batch)))
total_loss, total_cnt, target_tokens = 0, 0, 0
start_time = time.time()
for step in range(num_batch):
feed_dict = {}
for m, m_np in zip(mems, mems_np):
feed_dict[m] = m_np
fetched = sess.run(fetches, feed_dict=feed_dict)
loss_np, mems_np, tt = fetched
target_tokens += tt
cnt_np = 1
total_loss += loss_np * cnt_np
total_cnt += cnt_np
elapsed = time.time()-start_time
throughput = target_tokens / elapsed
latency = elapsed*1000
meters['eval_throughput'].update(throughput)
meters['eval_latency'].update(latency)
target_tokens = 0
if (step+1) % (num_batch // 10) == 0:
tf.logging.info(format_str.format(step+1, num_batch))
dllogger_data = {
'eval_latency': latency,
'eval_throughput': throughput,
}
dllogger.log(step=step+1, data=dllogger_data)
start_time = time.time()
avg_loss = total_loss / total_cnt
latency_data = np.array(meters['eval_latency'].vals)
tf.logging.info("Evaluating with: bs {}, math {} ".format(FLAGS.eval_batch_size, "fp16" if FLAGS.fp16 else "fp32"))
tf.logging.info("| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.1f}, ms/batch {:>4.2f}".format(
avg_loss, math.exp(avg_loss), avg_loss / math.log(2), meters['eval_throughput'].avg, meters['eval_latency'].avg))
summary = {
'eval_loss': avg_loss,
'eval_ppl': math.exp(avg_loss),
'eval_avg_throughput': meters['eval_throughput'].avg,
'eval_avg_latency': meters['eval_latency'].avg,
}
for p in FLAGS.percentiles:
p = int(p)
tf.logging.info("Latency {}%: {:>4.2f} ms".format(
p, np.percentile(latency_data, p)))
summary[f'eval_{p}%_latency'] = np.percentile(latency_data, p)
dllogger.log(step=tuple(), data=summary)
def main(unused_argv):
rank, local_rank, size = 0, 0, 1
if FLAGS.horovod:
hvd.init()
rank = hvd.rank()
local_rank = hvd.local_rank()
size = hvd.size()
del unused_argv # Unused
tf.logging.set_verbosity(tf.logging.INFO)
if FLAGS.fp16:
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
else:
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"
# Get corpus info
corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
n_token = corpus_info["vocab_size"]
cutoffs = corpus_info["cutoffs"][1:-1]
tf.logging.info("n_token {}".format(n_token))
setup_dllogger(enabled=True, filename=FLAGS.raport_file, rank=rank)
if FLAGS.do_train:
train(n_token, cutoffs, rank, local_rank, size)
if FLAGS.do_eval:
evaluate(n_token, cutoffs)
if __name__ == "__main__":
tf.app.run()

View file

@ -0,0 +1,539 @@
import tensorflow as tf
def positional_embedding(pos_seq, inv_freq, bsz=None):
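# Sinusoidal positional embedding: outer product of positions and inverse frequencies,
# then concat(sin, cos) along the last axis, optionally tiled across the batch dimension.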
sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
if bsz is not None:
return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
else:
return pos_emb[:, None, :]
def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer,
scope='ff', is_training=True):
output = inp
with tf.variable_scope(scope):
output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu,
kernel_initializer=kernel_initializer,
name='layer_1')
output = tf.layers.dropout(output, dropout, training=is_training,
name='drop_1')
output = tf.layers.dense(output, d_model,
kernel_initializer=kernel_initializer,
name='layer_2')
output = tf.layers.dropout(output, dropout, training=is_training,
name='drop_2')
output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1)
return output
def rel_shift(x):
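# Transformer-XL relative-shift trick: pad one column of zeros, reshape so rows shift by one,
# then slice, aligning each query with its relative-position scores without an explicit gather.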
x_size = tf.shape(x)
x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]])
x = tf.reshape(x, [x_size[0], x_size[1], x_size[3] + 1, x_size[2]])
x = tf.slice(x, [0, 0, 1, 0], [-1, -1, -1, -1])
x = tf.reshape(x, x_size)
return x
def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
n_head, d_head, dropout, dropatt, is_training,
kernel_initializer, scope='rel_attn'):
scale = 1 / (d_head ** 0.5)
with tf.variable_scope(scope):
qlen = tf.shape(w)[0]
rlen = tf.shape(r)[0]
bsz = tf.shape(w)[1]
cat = tf.concat([mems, w],
0) if mems is not None and mems.shape.ndims > 1 else w
w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
kernel_initializer=kernel_initializer, name='qkv')
r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
kernel_initializer=kernel_initializer, name='r')
w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
w_head_q = w_head_q[-qlen:]
klen = tf.shape(w_head_k)[0]
w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])
r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])
rw_head_q = w_head_q + r_w_bias
rr_head_q = w_head_q + r_r_bias
AC = tf.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k)
BD = tf.einsum('ibnd,jnd->bnij', rr_head_q, r_head_k)
BD = rel_shift(BD)
attn_score = (AC + BD) * scale
attn_mask_t = attn_mask[None, None, :, :]
attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
attn_prob = tf.nn.softmax(attn_score, 3)
attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)
attn_vec = tf.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v)
size_t = tf.shape(attn_vec)
attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])
attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
kernel_initializer=kernel_initializer, name='o')
attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
return output
def embedding_lookup(lookup_table, x, use_tpu=True):
if use_tpu:
n_token = tf.shape(lookup_table)[0]
one_hot_idx = tf.one_hot(x, n_token)
if one_hot_idx.shape.ndims == 2:
return tf.einsum('nd,in->id', lookup_table, one_hot_idx)
else:
return tf.einsum('nd,ibn->ibd', lookup_table, one_hot_idx)
else:
return tf.nn.embedding_lookup(lookup_table, x)
def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
proj_initializer, div_val=1,
proj_same_dim=True,
scope='adaptive_embed', **kwargs):
emb_scale = d_proj ** 0.5
with tf.variable_scope(scope):
if div_val == 1:
lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
initializer=initializer)
y = embedding_lookup(lookup_table, x, use_tpu=False)
if d_proj != d_embed:
proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
initializer=proj_initializer)
y = tf.einsum('ibe,ed->ibd', y, proj_W)
else:
proj_W = None
ret_params = [lookup_table, proj_W]
else:
tables, projs = [], []
cutoff_ends = [0] + cutoffs + [n_token]
x_size = tf.shape(x)
y = tf.zeros([x_size[0], x_size[1], d_proj])
for i in range(len(cutoff_ends) - 1):
with tf.variable_scope('cutoff_{}'.format(i)):
l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
mask = (x >= l_idx) & (x < r_idx)
cur_x = tf.boolean_mask(x, mask) - l_idx
cur_d_embed = d_embed // (div_val ** i)
lookup_table = tf.get_variable('lookup_table',
[r_idx - l_idx, cur_d_embed],
initializer=initializer)
cur_y = embedding_lookup(lookup_table, cur_x, use_tpu=False)
if d_proj == cur_d_embed and not proj_same_dim:
proj_W = None
else:
proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
initializer=proj_initializer)
cur_y = tf.einsum('id,de->ie', cur_y, proj_W)
mask_idx = tf.to_int64(tf.where(mask))
y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))
tables.append(lookup_table)
projs.append(proj_W)
ret_params = [tables, projs]
y *= emb_scale
return y, ret_params
def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
proj_initializer, div_val=1, perms=None,
proj_same_dim=True,
scope='adaptive_embed'):
"""
perms: If None, first compute W = W1 x W2 (projection for each bin),
and then compute X x W (embedding lookup). If not None,
use bin-based embedding lookup with max_bin_size defined by
the shape of perms.
"""
emb_scale = d_proj ** 0.5
with tf.variable_scope(scope):
if div_val == 1:
lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
initializer=initializer)
y = embedding_lookup(lookup_table, x)
if d_proj != d_embed:
proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
initializer=proj_initializer)
y = tf.einsum('ibe,ed->ibd', y, proj_W)
else:
proj_W = None
ret_params = [lookup_table, proj_W]
else:
tables, projs = [], []
cutoff_ends = [0] + cutoffs + [n_token]
x_size = tf.shape(x)
if perms is None:
cat_lookup = []
else:
cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])
for i in range(len(cutoff_ends) - 1):
with tf.variable_scope('cutoff_{}'.format(i)):
l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
cur_d_embed = d_embed // (div_val ** i)
lookup_table = tf.get_variable('lookup_table',
[r_idx - l_idx, cur_d_embed],
initializer=initializer)
if cur_d_embed == d_proj and not proj_same_dim:
proj_W = None
else:
proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
initializer=proj_initializer)
if perms is None:
cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))
else:
# speed up the computation of the first bin
# and also save some memory
if i == 0:
cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))
if proj_W is not None:
cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)
cur_y *= perms[i][:, :, None]
cat_lookup += cur_y
else:
cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])
cur_x = tf.to_int32(cur_x)
cur_y = embedding_lookup(lookup_table, cur_x)
if proj_W is not None:
cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)
cat_lookup += tf.einsum('kd,ibk->ibd', cur_y, perms[i])
tables.append(lookup_table)
projs.append(proj_W)
if perms is None:
cat_lookup = tf.concat(cat_lookup, 0)
y = embedding_lookup(cat_lookup, x)
else:
y = cat_lookup
ret_params = [tables, projs]
y *= emb_scale
return y, ret_params
def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
params, tie_projs,
initializer=None, proj_initializer=None,
div_val=1, scope='adaptive_softmax',
proj_same_dim=True,
return_mean=True, **kwargs):
def _logit(x, W, b, proj):
y = x
if proj is not None:
y = tf.einsum('ibd,ed->ibe', y, proj)
return tf.einsum('ibd,nd->ibn', y, W) + b
params_W, params_projs = params[0], params[1]
def _gather_logprob(logprob, target):
lp_size = tf.shape(logprob)
r = tf.range(lp_size[0])
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
with tf.variable_scope(scope):
if len(cutoffs) == 0:
softmax_b = tf.get_variable('bias', [n_token],
initializer=tf.zeros_initializer())
output = _logit(hidden, params_W, softmax_b, params_projs)
nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
logits=output)
else:
cutoff_ends = [0] + cutoffs + [n_token]
nll = tf.zeros_like(target, dtype=tf.float32)
for i in range(len(cutoff_ends) - 1):
with tf.variable_scope('cutoff_{}'.format(i)):
l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
mask = (target >= l_idx) & (target < r_idx)
mask_idx = tf.where(mask)
cur_target = tf.boolean_mask(target, mask) - l_idx
cur_d_embed = d_embed // (div_val ** i)
if div_val == 1:
cur_W = params_W[l_idx: r_idx]
else:
cur_W = params_W[i]
cur_b = tf.get_variable('b', [r_idx - l_idx],
initializer=tf.zeros_initializer())
if tie_projs[i]:
if div_val == 1:
cur_proj = params_projs
else:
cur_proj = params_projs[i]
else:
if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
cur_proj = None
else:
cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
initializer=proj_initializer)
if i == 0:
cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
initializer=tf.zeros_initializer())
cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
initializer=tf.zeros_initializer())
cur_W = tf.concat([cur_W, cluster_W], 0)
cur_b = tf.concat([cur_b, cluster_b], 0)
head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
head_logprob = tf.nn.log_softmax(head_logit)
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_logprob = _gather_logprob(cur_head_logprob, cur_target)
else:
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_hidden = tf.boolean_mask(hidden, mask)
tail_logit = tf.squeeze(_logit(
cur_hidden[None], cur_W, cur_b, cur_proj), 0)
tail_logprob = tf.nn.log_softmax(tail_logit)
cur_logprob = (cur_head_logprob[:, cutoff_ends[1] + i - 1] +
_gather_logprob(tail_logprob, cur_target))
nll += tf.scatter_nd(mask_idx, -cur_logprob,
tf.to_int64(tf.shape(nll)))
if return_mean:
nll = tf.reduce_mean(nll)
return nll
def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
params, tie_projs,
initializer=None, proj_initializer=None,
div_val=1, perms=None, proj_same_dim=True,
scope='adaptive_softmax',
**kwargs):
def _logit(x, W, b, proj):
y = x
if x.shape.ndims == 3:
if proj is not None:
y = tf.einsum('ibd,ed->ibe', y, proj)
return tf.einsum('ibd,nd->ibn', y, W) + b
else:
if proj is not None:
y = tf.einsum('id,ed->ie', y, proj)
return tf.einsum('id,nd->in', y, W) + b
params_W, params_projs = params[0], params[1]
with tf.variable_scope(scope):
if len(cutoffs) == 0:
softmax_b = tf.get_variable('bias', [n_token],
initializer=tf.zeros_initializer())
output = _logit(hidden, params_W, softmax_b, params_projs)
nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
logits=output)
nll = tf.reduce_mean(nll)
else:
total_loss, total_cnt = 0, 0
cutoff_ends = [0] + cutoffs + [n_token]
for i in range(len(cutoff_ends) - 1):
with tf.variable_scope('cutoff_{}'.format(i)):
l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
cur_d_embed = d_embed // (div_val ** i)
if div_val == 1:
cur_W = params_W[l_idx: r_idx]
else:
cur_W = params_W[i]
cur_b = tf.get_variable('b', [r_idx - l_idx],
initializer=tf.zeros_initializer())
if tie_projs[i]:
if div_val == 1:
cur_proj = params_projs
else:
cur_proj = params_projs[i]
else:
if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
cur_proj = None
else:
cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
initializer=proj_initializer)
if i == 0:
cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
initializer=tf.zeros_initializer())
cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
initializer=tf.zeros_initializer())
cur_W = tf.concat([cur_W, cluster_W], 0)
cur_b = tf.concat([cur_b, cluster_b], 0)
head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
head_target = kwargs.get("head_target")
head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=head_target,
logits=head_logit)
masked_loss = head_nll * perms[i]
total_loss += tf.reduce_sum(masked_loss)
total_cnt += tf.reduce_sum(perms[i])
else:
cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])
cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)
tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
perms[i])
tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.to_int32(tail_target),
logits=tail_logit)
sum_nll = cur_head_nll + tail_nll
mask = tf.reduce_sum(perms[i], [0, 1])
masked_loss = sum_nll * mask
total_loss += tf.reduce_sum(masked_loss)
total_cnt += tf.reduce_sum(mask)
nll = total_loss / total_cnt
return nll
def _create_mask(qlen, mlen, same_length=False):
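# Causal mask of shape [qlen, mlen + qlen]; 1 marks positions a query must NOT attend to
# (strictly future tokens). With same_length=True, older positions are masked as well so that
# every query sees a context window of the same size.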
attn_mask = tf.ones([qlen, qlen])
mask_u = tf.matrix_band_part(attn_mask, 0, -1)
mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
attn_mask_pad = tf.zeros([qlen, mlen])
ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
if same_length:
mask_l = tf.matrix_band_part(attn_mask, -1, 0)
ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
return ret
def _cache_mem(curr_out, prev_mem, mem_len=None):
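# Update the recurrence memory: append the current layer output to the cached states, keep the
# last mem_len steps, and stop gradients so backprop never flows into the cache.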
if mem_len is None or prev_mem is None:
new_mem = curr_out
elif mem_len == 0:
return prev_mem
else:
new_mem = tf.concat([prev_mem, curr_out], 0)[- mem_len:]
return tf.stop_gradient(new_mem)
def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
n_head, d_head, d_inner, dropout, dropatt,
initializer, is_training, proj_initializer=None,
mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
same_length=False, clamp_len=-1, use_tpu=False,
input_perms=None, target_perms=None, head_target=None,
untie_r=False, proj_same_dim=True,
scope='transformer'):
"""
cutoffs: a list of python int. Cutoffs for adaptive softmax.
tie_projs: a list of python bools. Whether to tie the projections.
use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
of adaptive softmax.
perms: a list of tensors. Each tensor should be of size [len, bsz, bin_size].
Only used in the adaptive setting.
"""
new_mems = []
with tf.variable_scope(scope):
if untie_r:
r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
initializer=initializer)
r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
initializer=initializer)
else:
r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
initializer=initializer)
r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
initializer=initializer)
qlen = tf.shape(dec_inp)[0]
mlen = tf.shape(mems[0])[0] if mems is not None else 0
klen = mlen + qlen
if proj_initializer is None:
proj_initializer = initializer
lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
mask_adaptive_embedding_lookup)
embeddings, shared_params = lookup_fn(
x=dec_inp,
n_token=n_token,
d_embed=d_embed,
d_proj=d_model,
cutoffs=cutoffs,
initializer=initializer,
proj_initializer=proj_initializer,
div_val= div_val,
perms=input_perms,
proj_same_dim=proj_same_dim)
attn_mask = _create_mask(qlen, mlen, same_length)
pos_seq = tf.range(klen - 1, -1, -1.0)
if clamp_len > 0:
pos_seq = tf.minimum(pos_seq, clamp_len)
inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
pos_emb = positional_embedding(pos_seq, inv_freq)
output = tf.layers.dropout(embeddings, dropout, training=is_training)
pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)
if mems is None:
mems = [None] * n_layer
for i in range(n_layer):
# cache new mems
new_mems.append(_cache_mem(output, mems[i], mem_len))
with tf.variable_scope('layer_{}'.format(i)):
output = rel_multihead_attn(
w=output,
r=pos_emb,
r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
attn_mask=attn_mask,
mems=mems[i],
d_model=d_model,
n_head=n_head,
d_head=d_head,
dropout=dropout,
dropatt=dropatt,
is_training=is_training,
kernel_initializer=initializer)
output = positionwise_FF(
inp=output,
d_model=d_model,
d_inner=d_inner,
dropout=dropout,
kernel_initializer=initializer,
is_training=is_training)
output = tf.layers.dropout(output, dropout, training=is_training)
logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
mask_adaptive_logsoftmax)
loss = logsoftmax_fn(
hidden=output,
target=target,
n_token=n_token,
d_embed=d_embed,
d_proj=d_model,
cutoffs=cutoffs,
params=shared_params,
tie_projs=tie_projs,
initializer=initializer,
proj_initializer=proj_initializer,
div_val=div_val,
perms=target_perms,
head_target=head_target,
proj_same_dim=proj_same_dim)
return loss, new_mems

View file

@ -0,0 +1,98 @@
#!/bin/bash
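# Usage sketch: bash run_wt103_base.sh train_data            # build training tfrecords
#               bash run_wt103_base.sh train <NUM_GPUS> ...  # multi-GPU training via horovodrun
#               bash run_wt103_base.sh eval ...              # single-GPU evaluation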
# Data
DATA_ROOT=../data/wikitext-103/
# Model
DIV_VAL=1
N_LAYER=16
D_MODEL=512
D_EMBED=512
N_HEAD=8
D_HEAD=64
D_INNER=2048
# Training
TGT_LEN=192
MEM_LEN=192
NUM_CORE=${2:-"8"}
# Testing
TEST_TGT_LEN=64
TEST_MEM_LEN=640
TEST_CLAMP_LEN=400
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=wt103 \
--tgt_len=${TGT_LEN} \
--num_passes=2 \
--use_tpu=False \
--eval_batch_size=0 \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
horovodrun -np ${NUM_CORE} -H localhost:${NUM_CORE} python main.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.01 \
--warmup_steps=1000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--num_core_per_host=${NUM_CORE} \
${@:3}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python main.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--horovod=False \
--eval_split=test \
${@:2}
else
echo 'unknown argument 1'
fi

View file

@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . --network=host --rm -t transformer-xl:latest

View file

@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
nvidia-docker run --init -it --rm --network=host --ipc=host -v $PWD:/workspace/transformer-xl transformer-xl bash

View file

@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
BATCH_SIZES=(1 2 4 8 16 32)
# "empty" MATH corresponds to fp32
MATHS=("" "--fp16")
for (( j = 0; j < ${#BATCH_SIZES[@]}; j++ )); do
for (( k = 0; k < ${#MATHS[@]}; k++ )); do
echo batch size: ${BATCH_SIZES[j]} math: ${MATHS[k]}
taskset -c 0 bash run_wt103_base.sh eval \
--eval_batch_size "${BATCH_SIZES[j]}" \
"${MATHS[k]}" \
"${@:1}"
done
done

View file

@ -0,0 +1,170 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import Counter, OrderedDict
import numpy as np
import tensorflow as tf
from tensorflow.gfile import Open as open
from tensorflow.gfile import Exists as exists
class Vocab(object):
def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
delimiter=None, vocab_file=None):
self.counter = Counter()
self.special = special
self.min_freq = min_freq
self.max_size = max_size
self.lower_case = lower_case
self.delimiter = delimiter
self.vocab_file = vocab_file
def tokenize(self, line, add_eos=False, add_double_eos=False):
line = line.strip()
# convert to lower case
if self.lower_case:
line = line.lower()
# an empty delimiter '' is falsy, so compare explicitly: '' means character-level tokenization
if self.delimiter == '':
symbols = line
else:
symbols = line.split(self.delimiter)
if add_double_eos: # lm1b
return ['<S>'] + symbols + ['<S>']
elif add_eos:
return symbols + ['<eos>']
else:
return symbols
def count_file(self, path, verbose=False, add_eos=False):
if verbose: print('counting file {} ...'.format(path))
assert exists(path)
sents = []
with open(path, 'r') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
return sents
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose: print('counting {} sents ...'.format(len(sents)))
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
self.idx2sym = []
self.sym2idx = OrderedDict()
with open(vocab_file, 'r') as f:
for line in f:
symb = line.strip().split()[0]
self.add_symbol(symb)
self.unk_idx = self.sym2idx['<UNK>']
def build_vocab(self):
if self.vocab_file:
print('building vocab from {}'.format(self.vocab_file))
self._build_from_file(self.vocab_file)
print('final vocab size {}'.format(len(self)))
else:
print('building vocab with min_freq={}, max_size={}'.format(
self.min_freq, self.max_size))
self.idx2sym = []
self.sym2idx = OrderedDict()
for sym in self.special:
self.add_special(sym)
for sym, cnt in self.counter.most_common(self.max_size):
if cnt < self.min_freq: break
self.add_symbol(sym)
print('final vocab size {} from {} unique tokens'.format(
len(self), len(self.counter)))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
add_double_eos=False):
if verbose: print('encoding file {} ...'.format(path))
assert exists(path)
encoded = []
with open(path, 'r') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos,
add_double_eos=add_double_eos)
encoded.append(self.convert_to_nparray(symbols))
if ordered:
encoded = np.concatenate(encoded)
return encoded
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose: print('encoding {} sents ...'.format(len(sents)))
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
encoded.append(self.convert_to_nparray(symbols))
if ordered:
encoded = np.concatenate(encoded)
return encoded
def add_special(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
def add_symbol(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
def get_sym(self, idx):
assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
return self.idx2sym[idx]
def get_idx(self, sym):
if sym in self.sym2idx:
return self.sym2idx[sym]
else:
assert hasattr(self, 'unk_idx')
return self.sym2idx.get(sym, self.unk_idx)
def get_symbols(self, indices):
return [self.get_sym(idx) for idx in indices]
def get_indices(self, symbols):
return [self.get_idx(sym) for sym in symbols]
def convert_to_nparray(self, symbols):
nparray = np.array(self.get_indices(symbols), dtype=np.int64)
return nparray
def convert_to_sent(self, indices, exclude=None):
if exclude is None:
return ' '.join([self.get_sym(idx) for idx in indices])
else:
return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
def __len__(self):
return len(self.idx2sym)
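# Typical usage sketch (file paths are illustrative):
#
#   vocab = Vocab(special=['<eos>'], lower_case=True)
#   vocab.count_file('wiki.train.tokens', add_eos=True)
#   vocab.build_vocab()
#   train_ids = vocab.encode_file('wiki.train.tokens', ordered=True, add_eos=True)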

View file

@ -48,7 +48,7 @@ The differences between this Wide & Deep Recommender Model and the model from th
The model enables you to train a recommender model that combines the memorization of the Wide part and generalization of the Deep part of the network.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 1.32 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 1.44 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
@ -62,12 +62,6 @@ Figure 1. The architecture of the Wide & Deep model.
### Applications and dataset
The basis of our API lies in the observation that in recommendation problems there are hierarchies of features: those which describe the person or object _to which_ we wish to make recommendations (*request* level features), and those which describe those objects which we are considering recommending (*item* level features). Additionally, these features often need to undergo some transformation from their raw representation in data stores to a representation digestible by neural networks. These transformations, defined by [TensorFlow `tf.feature_column`](https://www.tensorflow.org/api_docs/python/tf/feature_column), include nontrivial operations such as hashing, binning, vocabulary lookups, and embedding (indicator columns can be thought of as embeddings with the identity matrix as the embedding table).
In most APIs, including those implemented in standard TensorFlow, these transformations need to be computed for request level features repeatedly for _every_ item on which we want to compute a recommendation score. Moreover, if the model is being hosted on a dedicated remote inference server, this requires us to send copies of the request level data for every item as well.
To address this, we built a custom GPU op which computes _all_ these transformations in parallel, and only reads and computes request level features once before fanning them out to the rest of the batch. Besides saving on redundant compute and network I/O, this implementation leverages the exceptional parallel computing power of NVIDIA GPUs to provide massive inference time accelerations compared to native CPU based implementations.
As a reference dataset, we used a subset of [the features engineered](https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine) by the 19th place finisher in the [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). This competition challenged competitors to predict the likelihood with which a particular ad on a website's display would be clicked on. Competitors were given information about the user, display, document, and ad in order to train their models. More information can be found [here](https://www.kaggle.com/c/outbrain-click-prediction/data).
@ -215,11 +209,11 @@ size (4096 is the default).
Single GPU:
```bash
python -m trainer.task --gpu --amp --batch_size 131072 --num_epochs 100
python -m trainer.task --gpu --amp --global_batch_size 131072 --num_epochs 120
```
8 GPU:
```bash
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --gpu --amp --hvd --batch_size 16384 --num_epochs 20
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --gpu --amp --hvd --global_batch_size 131072 --num_epochs 120
```
If you want to run validation or inference, you can either use the checkpoint obtained from the training
@ -356,9 +350,9 @@ Our results were obtained by running the benchmark scripts from the `scripts` di
|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (MAP@12)**|**Accuracy - Mixed precision (MAP@12)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 131,072 | 0.67689 | 0.67542 | 546 | 414 | 1.32 |
| 4 | 32,768 | 0.67677 | 0.67647 | 78 | 66 | 1.18 |
| 8 | 16,384 | 0.67669 | 0.67594 | 30 | 24 | 1.25 |
| 1 | 131,072 | 0.67647 | 0.67634 | 654 | 454 | 1.44 |
| 4 | 32,768 | 0.67599 | 0.67652 | 226 | 183 | 1.23 |
| 8 | 16,384 | 0.67688 | 0.67690 | 167 | 153 | 1.09 |
To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@ -368,15 +362,15 @@ To achieve the same results, follow the steps in the [Quick Start Guide](#quick-
##### Training stability test
The Wide and Deep model was trained for 72,951 training steps, starting
from 20 different initial random seeds. The training was performed in the 20.02-tf1-py3-stage NGC container on
The Wide and Deep model was trained for 54,713 training steps, starting
from 50 different initial random seeds. The training was performed in the 20.02-tf1-py3-stage NGC container on
NVIDIA DGX-1 with 8x V100 16G GPUs with mixed precision enabled.
After training, the models were evaluated on the test dataset. The following
table summarizes the final MAP@12 score on the test set.
|**Average MAP@12**|**Standard deviation**|**Minimum**|**Maximum**|
|---------------------:|---------------------:|----------:|----------:|
| 0.67594 | 0.00204 | 0.66906 | 0.67785 |
| 0.67690 | 0.00081 | 0.67432 | 0.67821 |
#### Training performance results
@ -390,9 +384,9 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (samples/s)**|**Throughput - Mixed precision (samples/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 131,072 | 167,875 | 221,550 | 1.320 | 1.000 | 1.000 |
| 4 | 131,072 | 485,242 | 547,683 | 1.129 | 2.472 | 2.890 |
| 8 | 131,072 | 655,665 | 688,481 | 1.050 | 3.108 | 3.906 |
| 1 | 131,072 | 168,181 | 242,332 | 1.44 | 1.00 | 1.00 |
| 4 | 131,072 | 487,719 | 602,027 | 1.23 | 2.47 | 2.89 |
| 8 | 131,072 | 659,533 | 718,820 | 1.09 | 3.11 | 3.91 |

Binary file not shown (image changed: 60 KiB before, 35 KiB after)

View file

@ -17,4 +17,4 @@
set -x
set -e
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark --amp

View file

@ -17,4 +17,4 @@
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark --amp

View file

@ -17,4 +17,4 @@
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark --amp

View file

@ -17,4 +17,4 @@
set -x
set -e
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark

View file

@@ -17,4 +17,4 @@
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark

View file

@@ -17,4 +17,4 @@
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --global_batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --model_type wide_n_deep --gpu --benchmark

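The six launch scripts above all make the same substitution: the per-worker --batch_size flag becomes --global_batch_size, and the explicit --num_epochs 15 override is dropped in favour of the new default. Under the old flag each Horovod worker consumed the full 131072 examples per step, so the effective global batch scaled with the worker count; under the new flag the total is fixed and split across workers. A minimal sketch of the difference, assuming the 8-GPU invocation above (variable names are illustrative, not taken from the code):

num_gpus = 8                                          # the -np value passed to mpiexec above
# Old semantics: --batch_size 131072 was a per-worker batch.
old_effective_global_batch = num_gpus * 131072        # 1,048,576 examples per step
# New semantics: --global_batch_size 131072 is the total across all workers.
new_per_worker_batch = 131072 // num_gpus             # 16,384 examples per worker per step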
View file

@@ -94,8 +94,8 @@ def create_parser():
default=4096,
type=int)
parser.add_argument(
'--batch_size',
help='Training batch size',
'--global_batch_size',
help='Total training batch size',
default=131072,
type=int)
parser.add_argument(
@@ -116,7 +116,7 @@ def create_parser():
parser.add_argument(
'--num_epochs',
help='Number of epochs',
default=100,
default=120,
type=int)
parser.add_argument(
'--save_checkpoints_secs',
@@ -334,7 +334,7 @@ def get_feature_columns(use_all_columns=False, force_subset=None):
def separate_input_fn(
tf_transform_output,
transformed_examples,
batch_size,
create_batches,
mode,
reader_num_threads=1,
parser_num_threads=2,
@@ -357,7 +357,7 @@ def separate_input_fn(
if (mode==tf.estimator.ModeKeys.TRAIN and shuffle_buffer_size > 1) \
else raw_dataset
raw_dataset = raw_dataset.repeat()
raw_dataset = raw_dataset.batch(batch_size)
raw_dataset = raw_dataset.batch(create_batches)
# this function appears to require each element to be a vector
# batching should mean that this is always true
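The separate_input_fn hunk above renames the batching argument from batch_size to create_batches because the TFRecords are prebatched: every stored element already contains prebatch_size examples, so dataset.batch(create_batches) stacks that many prebatched elements and yields create_batches * prebatch_size examples per step. A toy tf.data sketch of the idea, with illustrative shapes and values rather than the real records:

import tensorflow as tf

prebatch_size = 4        # each stored element already holds this many examples
create_batches = 3       # prebatched elements stacked per training step

# Toy stand-in for the prebatched dataset: 6 elements, each a vector of 4 examples.
raw_dataset = tf.data.Dataset.from_tensor_slices(tf.reshape(tf.range(24), [6, prebatch_size]))
raw_dataset = raw_dataset.repeat().batch(create_batches)
# Each iteration now delivers create_batches * prebatch_size = 12 examples.
print(next(iter(raw_dataset)).shape)   # (3, 4)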
@@ -509,8 +509,7 @@ def custom_estimator_model_fn(features, labels, mode, params, config):
with tf.compat.v1.variable_scope('deep', values=features) as scope:
deep_absolute_scope = scope.name
if params['model_type'] in [DEEP, WIDE_N_DEEP]:
deep_features = features.copy()
deep_current = tf.compat.v1.feature_column.input_layer(deep_features, params['deep_columns'])
deep_current = tf.compat.v1.feature_column.input_layer(features, params['deep_columns'])
if params['model_type'] in [DEEP, WIDE_N_DEEP]:
for layer_ind in range(len(params['layers'])):
@@ -640,7 +639,8 @@ def main(FLAGS):
dllogger.log(data=vars(FLAGS), step='PARAMETER')
create_batches = FLAGS.batch_size // FLAGS.prebatch_size
local_batch_size = FLAGS.global_batch_size // num_gpus
create_batches = local_batch_size // FLAGS.prebatch_size
wide_columns, deep_columns = get_feature_columns(use_all_columns=FLAGS.use_all_columns)
tf_transform_output = tft.TFTransformOutput(FLAGS.transformed_metadata_path)
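In main(), the per-worker dataset batching is now derived from the global batch: each of the num_gpus workers takes an equal share and converts it into a count of prebatched elements. Worked arithmetic with the defaults used in the scripts above (the worker count is chosen for illustration):

num_gpus = 4                                          # hypothetical worker count
global_batch_size = 131072                            # --global_batch_size default
prebatch_size = 4096                                  # --prebatch_size default
local_batch_size = global_batch_size // num_gpus      # 32768 examples per worker per step
create_batches = local_batch_size // prebatch_size    # 8 prebatched elements per worker step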
@@ -727,7 +727,7 @@ def main(FLAGS):
estimator = tf.estimator.add_metrics(estimator, map_custom_metric)
estimator = tf.estimator.add_metrics(estimator, map_custom_metric_with_leak)
steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size
steps_per_epoch = FLAGS.training_set_size / FLAGS.global_batch_size
print('Steps per epoch: {}'.format(steps_per_epoch))
max_steps = int(FLAGS.num_epochs * steps_per_epoch)
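steps_per_epoch is likewise computed from the global batch, so the step count no longer depends on the per-worker split. A hedged arithmetic example; the training-set size below is a hypothetical stand-in, not a value taken from this diff:

training_set_size = 55_000_000                             # hypothetical, for illustration only
global_batch_size = 131072
num_epochs = 120                                           # new create_parser() default above
steps_per_epoch = training_set_size / global_batch_size    # ~419.6 global steps per epoch
max_steps = int(num_epochs * steps_per_epoch)              # ~50354 training steps in total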
@@ -738,7 +738,7 @@ def main(FLAGS):
if FLAGS.predict or FLAGS.evaluate: # inference
if FLAGS.benchmark:
benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.eval_batch_size, warmup_steps=FLAGS.benchmark_warmup_steps)
benchmark_hook = BenchmarkLoggingHook(global_batch_size=FLAGS.eval_batch_size, warmup_steps=FLAGS.benchmark_warmup_steps)
hooks.append(benchmark_hook)
eval_steps = FLAGS.benchmark_steps
else:
@@ -775,7 +775,7 @@ def main(FLAGS):
else: # training
if FLAGS.benchmark:
benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.batch_size,
benchmark_hook = BenchmarkLoggingHook(global_batch_size=FLAGS.global_batch_size,
warmup_steps=FLAGS.benchmark_warmup_steps)
hooks.append(benchmark_hook)
estimator.train(train_input_fn, hooks=hooks, steps=FLAGS.benchmark_steps)
@@ -787,7 +787,7 @@ def main(FLAGS):
throttle_secs=FLAGS.eval_throttle_secs, steps=FLAGS.eval_steps)
result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
if result:
if result != (None, None):
dllogger.log(step=(), data={'map': float(result[0]['map']),
'map_with_leak': float(result[0]['map_with_leak'])})
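The final hunk of this file replaces a truthiness test on the return value of tf.estimator.train_and_evaluate. The new comparison implies that (None, None) is a possible return value (a run that produced no evaluation result); since a two-element tuple is truthy even when both elements are None, the old "if result:" guard would fall through to result[0]['map'] and raise. A minimal illustration of the pitfall:

result = (None, None)          # the return shape the corrected check guards against
print(bool(result))            # True: a non-empty tuple is truthy regardless of its contents
if result != (None, None):     # corrected guard: only log when an eval result actually exists
    print(result[0]['map'])    # under the old "if result:" check this line raised a TypeError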

View file

@@ -1,8 +1,7 @@
ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/tensorflow:20.02-tf2-py3-devel
FROM ${FROM_IMAGE_NAME}
FROM nvcr.io/nvidia/tensorflow:20.02-tf2-py3
ADD . /workspace/unet
WORKDIR /workspace/unet
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
RUN pip install -r requirements.txt