[DLRM/PyT] Update

2021-03-24 19:01:16 +01:00 · 2021-03-24 19:01:16 +01:00 · 4e764dcd78
parent 19fac37435
commit 4e764dcd78
63 changed files with 14725 additions and 849 deletions
--- a/PyTorch/Recommendation/DLRM/Dockerfile
+++ b/PyTorch/Recommendation/DLRM/Dockerfile
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
 FROM ${FROM_IMAGE_NAME}

 ADD requirements.txt .
@ -22,4 +22,5 @@ WORKDIR /workspace/dlrm

 COPY . .

+RUN chmod +x bind.sh
 RUN pip install --no-cache-dir -e .
--- a/PyTorch/Recommendation/DLRM/Dockerfile_preprocessing
+++ b/PyTorch/Recommendation/DLRM/Dockerfile_preprocessing
@ -0,0 +1,95 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build arguments declared before FROM are only visible to FROM lines, so the
+# ones used by build steps are re-declared inside the base stage below.
+# CUDF_VERSION and RAPIDS_VERSION select the jars downloaded from Maven Central.
+ARG CUDF_VERSION=0.18.1
+ARG RAPIDS_VERSION=0.4.0
+ARG SPARK_VERSION=3.0.1
+# Target machine; selects which image-machine-* stage becomes the final image.
+ARG DGX_VERSION=DGX-2
+ARG NUMBER_OF_GPUS
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/nvtabular:0.3
+FROM ${FROM_IMAGE_NAME} AS base
+ARG CUDF_VERSION
+ARG RAPIDS_VERSION
+ARG SPARK_VERSION
+ARG DGX_VERSION
+ARG NUMBER_OF_GPUS
+
+# openjdk-8 provides the JVM for Spark; curl downloads the artifacts below.
+# apt-get is used instead of apt because apt's CLI is not meant for scripts.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        curl \
+        git \
+        openjdk-8-jdk && \
+    rm -rf /var/lib/apt/lists/*
+
+# Fetch Spark and the RAPIDS jars. --fail makes curl exit non-zero on an HTTP
+# error instead of silently saving the error page as the artifact. The jars go
+# straight into Spark's jars directory so no later layer has to move them.
+RUN curl --fail --location --silent --show-error \
+        "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz" \
+        -o /opt/spark.tgz && \
+    tar zxf /opt/spark.tgz -C /opt/ && \
+    mv "/opt/spark-${SPARK_VERSION}-bin-hadoop2.7" /opt/spark && \
+    rm /opt/spark.tgz && \
+    curl --fail --location --silent --show-error \
+        "https://repo1.maven.org/maven2/ai/rapids/cudf/${CUDF_VERSION}/cudf-${CUDF_VERSION}-cuda11.jar" \
+        -o /opt/spark/jars/cudf.jar && \
+    curl --fail --location --silent --show-error \
+        "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${RAPIDS_VERSION}/rapids-4-spark_2.12-${RAPIDS_VERSION}.jar" \
+        -o /opt/spark/jars/rapids-4-spark.jar
+
+WORKDIR /workspace/dlrm
+
+COPY . .
+
+# Install the Spark GPU configuration shipped with the sources and mark the
+# preprocessing entry points as executable, all in a single layer.
+RUN mv /workspace/dlrm/preproc/gpu/get_gpu_resources.sh /opt/spark/conf/ && \
+    mv /workspace/dlrm/preproc/gpu/spark-defaults.conf /opt/spark/conf/spark-defaults.conf && \
+    rm -rf /workspace/dlrm/preproc/gpu && \
+    chmod +x /opt/spark/conf/get_gpu_resources.sh \
+             /workspace/dlrm/preproc/run_NVTabular.sh \
+             "/workspace/dlrm/preproc/run_spark_gpu_${DGX_VERSION}.sh"
+
+ENV SPARK_HOME=/opt/spark
+ENV PYTHONPATH=$SPARK_HOME/python
+ENV PYSPARK_PYTHON=/conda/envs/rapids/bin/python
+ENV PYSPARK_DRIVER_PYTHON=/conda/envs/rapids/bin/python
+ENV DGX_VERSION=$DGX_VERSION
+ENV SPARK_VERSION=$SPARK_VERSION
+
+# "source" is a bash builtin, so RUN must use bash from here on.
+SHELL ["/bin/bash", "-c"]
+
+RUN source activate rapids && \
+    pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements_preprocessing.txt
+
+# Per-machine stages: each one only fixes NUMBER_OF_GPUS for the final image.
+FROM base AS image-machine-DGX-2
+ARG NUMBER_OF_GPUS
+# Falls back to 16 when NUMBER_OF_GPUS is not passed as a build argument.
+ENV NUMBER_OF_GPUS=${NUMBER_OF_GPUS:-16}
+
+FROM base AS image-machine-DGX-A100
+ENV NUMBER_OF_GPUS=8
+
+# DGX_VERSION picks which of the stages above is extended here.
+FROM image-machine-${DGX_VERSION} AS final
+RUN echo "spark.worker.resource.gpu.amount    ${NUMBER_OF_GPUS}" >> /opt/spark/conf/spark-defaults.conf
--- a/PyTorch/Recommendation/DLRM/Dockerfile_spark
+++ b/PyTorch/Recommendation/DLRM/Dockerfile_spark
@ -1,39 +0,0 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
-FROM ${FROM_IMAGE_NAME}
-
-RUN apt update && \
-    apt install -y openjdk-8-jdk && \
-    curl https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz -o /opt/spark-3.0.0-bin-hadoop3.2.tgz && \
-    tar zxf /opt/spark-3.0.0-bin-hadoop3.2.tgz -C /opt/ && \
-    rm /opt/spark-3.0.0-bin-hadoop3.2.tgz && \
-    curl https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-2.jar -o /opt/cudf-0.14-cuda10-2.jar && \
-    curl https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0/rapids-4-spark_2.12-0.1.0.jar -o /opt/rapids-4-spark_2.12-0.1.0.jar
-
-ADD requirements.txt .
-RUN pip install -r requirements.txt
-
-WORKDIR /workspace/dlrm
-
-COPY . .
-
-RUN mv /opt/cudf-0.14-cuda10-2.jar  /opt/spark-3.0.0-bin-hadoop3.2/jars && \
-    mv /opt/rapids-4-spark_2.12-0.1.0.jar /opt/spark-3.0.0-bin-hadoop3.2/jars/ && \
-    mv /workspace/dlrm/preproc/gpu/get_gpu_resources.sh /opt/spark-3.0.0-bin-hadoop3.2/conf/ && \
-    mv /workspace/dlrm/preproc/gpu/spark-defaults.conf /opt/spark-3.0.0-bin-hadoop3.2/conf/ && \
-    rm -fr /workspace/dlrm/preproc/gpu
-
-RUN chmod +x /opt/spark-3.0.0-bin-hadoop3.2/conf/get_gpu_resources.sh
--- a/PyTorch/Recommendation/DLRM/README.md
+++ b/PyTorch/Recommendation/DLRM/README.md
@ -1,5 +1,3 @@
-
-
 # DLRM For PyTorch

 This repository provides a script and recipe to train the Deep Learning Recommendation Model (DLRM) to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA.
@ -7,46 +5,51 @@ This repository provides a script and recipe to train the Deep Learning Recommen
 ## Table Of Contents	

  * [Model overview](#model-overview)
-     * [Model architecture](#model-architecture)
-     * [Default configuration](#default-configuration)
-     * [Feature support matrix](#feature-support-matrix)
-        * [Features](#features)
-     * [Mixed precision training](#mixed-precision-training)
-        * [Enabling mixed precision](#enabling-mixed-precision)
-        * [Enabling TF32](#enabling-tf32)
-     * [Hybrid-parallel multiGPU with all-2-all communication](#hybrid-parallel-multigpu-with-all-2-all-communication)
-        * [Embedding table placement and load balancing](#embedding-table-placement-and-load-balancing)
-     * [Preprocessing on GPU with Spark 3](#preprocessing-on-gpu-with-spark-3)
+    + [Model architecture](#model-architecture)
+    + [Default configuration](#default-configuration)
+    + [Feature support matrix](#feature-support-matrix)
+      - [Features](#features)
+    + [Mixed precision training](#mixed-precision-training)
+      - [Enabling mixed precision](#enabling-mixed-precision)
+      - [Enabling TF32](#enabling-tf32)
+    + [Hybrid-parallel multi-GPU with all-2-all communication](#hybrid-parallel-multi-gpu-with-all-2-all-communication)
+      - [Embedding table placement and load balancing](#embedding-table-placement-and-load-balancing)
+    + [Preprocessing on GPU](#preprocessing-on-gpu)
  * [Setup](#setup)
-     * [Requirements](#requirements)
+    + [Requirements](#requirements)
  * [Quick Start Guide](#quick-start-guide)
  * [Advanced](#advanced)
-     * [Scripts and sample code](#scripts-and-sample-code)
-     * [Parameters](#parameters)
-     * [Command-line options](#command-line-options)
-     * [Getting the data](#getting-the-data)
-        * [Dataset guidelines](#dataset-guidelines)
-        * [Multi-dataset](#multi-dataset)
-        * [Preprocess with Spark](#preprocess-with-spark)
-     * [Training process](#training-process)
-     * [Inference process](#inference-process)
-     * [Deploying DLRM Using NVIDIA Triton Inference Server](#deploying-dlrm-using-nvidia-triton-inference-server)
+    + [Scripts and sample code](#scripts-and-sample-code)
+    + [Parameters](#parameters)
+    + [Command-line options](#command-line-options)
+    + [Getting the data](#getting-the-data)
+      - [Dataset guidelines](#dataset-guidelines)
+      - [Multi-dataset](#multi-dataset)
+      - [Preprocessing](#preprocessing)
+        * [NVTabular](#nvtabular)
+        * [Spark](#spark)
+    + [Training process](#training-process)
+    + [Inference process](#inference-process)
+    + [Deploying DLRM Using NVIDIA Triton Inference Server](#deploying-dlrm-using-nvidia-triton-inference-server)
  * [Performance](#performance)
-     * [Benchmarking](#benchmarking)
-        * [Training performance benchmark](#training-performance-benchmark)
-        * [Inference performance benchmark](#inference-performance-benchmark)
-     * [Results](#results)
-        * [Training accuracy results](#training-accuracy-results)
-           * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
-           * [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
-           * [Training stability test](#training-stability-test)
-        * [Training performance results](#training-performance-results)
-           * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
-           * [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
-           * [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
+    + [Benchmarking](#benchmarking)
+      - [Training performance benchmark](#training-performance-benchmark)
+      - [Inference performance benchmark](#inference-performance-benchmark)
+    + [Results](#results)
+      - [Training accuracy results](#training-accuracy-results)
+        * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
+        * [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
+        * [Training accuracy plots](#training-accuracy-plots)
+        * [Training stability test](#training-stability-test)
+        * [Impact of mixed precision on training accuracy](#impact-of-mixed-precision-on-training-accuracy)
+      - [Training performance results](#training-performance-results)
+        * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
+        * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+        * [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+        * [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
  * [Release notes](#release-notes)
-     * [Changelog](#changelog)
-     * [Known issues](#known-issues)
+    + [Changelog](#changelog)
+    + [Known issues](#known-issues)

 ## Model overview

@ -56,13 +59,13 @@ make use of both categorical and numerical inputs. It was first described in
 This repository provides a reimplementation of the codebase provided originally [here](https://github.com/facebookresearch/dlrm).
 The scripts provided enable you to train DLRM on the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/). 

-Using the scripts provided here, you can efficiently train models that are too large to fit into a single GPU. This is because we use a hybrid-parallel approach, which combines model parallelism for the embedding tables with data parallelism for the Top MLP. This is explained in details in [next sections](#hybrid-parallel-multigpu-with-all-2-all-communication)
+Using the scripts provided here, you can efficiently train models that are too large to fit into a single GPU. This is because we use a hybrid-parallel approach, which combines model parallelism for the embedding tables with data parallelism for the Top MLP. This is explained in detail in the [next sections](#hybrid-parallel-multi-gpu-with-all-2-all-communication).

 This model uses a slightly different preprocessing procedure than the one found in the original implementation. You can find a detailed description of the preprocessing steps in the [Dataset guidelines](#dataset-guidelines) section.

 Using DLRM you can train a high-quality general model for providing recommendations.

-This model is trained with mixed precision using Tensor Cores on Volta, Turing and NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3.4x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. It is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and NVIDIA Ampere GPU architectures. Therefore, researchers can get results up to 3.3x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. It is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.



@ -72,7 +75,7 @@ DLRM accepts two types of features: categorical and numerical. For each categori
 feature, an embedding table is used to provide dense representation to each unique value. The dense features enter the model and are transformed by a 
 simple neural network referred to as "bottom MLP". This part of the network consists of a series
 of linear layers with ReLU activations. The output of the bottom MLP and the embedding vectors
-are then fed into the "dot interaction" operation. The output of "dot interaction" is then concatenated with the features resulting from bottom MLP and fed into the "top MLP" which is also a series of dense layers with activations.
+are then fed into the "dot interaction" operation. The output of "dot interaction" is then concatenated with the features resulting from the bottom MLP and fed into the "top MLP" which is also a series of dense layers with activations.
 The model outputs a single number which can be interpreted as a likelihood of a certain user clicking an ad. 


@ -88,19 +91,22 @@ Figure 1. The architecture of DLRM.
 The following features were implemented in this model:
 - general
 	- static loss scaling for Tensor Cores (mixed precision) training
-	- hybrid-parallel multiGPU training
+	- hybrid-parallel multi-GPU training
 - preprocessing
    - dataset preprocessing using Spark 3 on GPUs 
+    - dataset preprocessing using NVTabular on GPUs 
    
 ### Feature support matrix

 The following features are supported by this model: 

-| Feature               | DLRM                
-|----------------------|--------------------------
-|Automatic mixed precision (AMP)   | yes
-|Hybrid-parallel multiGPU with all-2-all| yes
-|Preprocessing on GPU with Spark 3| yes
+| Feature                                 | DLRM                
+|-----------------------------------------|-----
+|Automatic mixed precision (AMP)          | yes
+|Hybrid-parallel multi-GPU with all-2-all | yes
+|Preprocessing on GPU with NVTabular      | yes
+|Preprocessing on GPU with Spark 3        | yes
+
         
 #### Features

@ -108,10 +114,14 @@ Automatic Mixed Precision (AMP) - enables mixed precision training without any c

 Multi-GPU training with PyTorch distributed - our model uses `torch.distributed` to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [PyTorch Tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html).

+Preprocessing on GPU with NVTabular - Criteo dataset preprocessing can be conducted using [NVTabular](https://github.com/NVIDIA/NVTabular). For more information on the framework, see the [Announcing the NVIDIA NVTabular Open Beta with Multi-GPU Support and New Data Loaders](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) blog post.
+
+Preprocessing on GPU with Spark 3 - Criteo dataset preprocessing can be conducted using [Apache Spark 3.0](https://spark.apache.org/). For more information on the framework and how to leverage GPUs for preprocessing, see the [Accelerating Apache Spark 3.0 with GPUs and RAPIDS](https://developer.nvidia.com/blog/accelerating-apache-spark-3-0-with-gpus-and-rapids/) blog post.
+

 ### Mixed precision training

-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3.4x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in the half-precision floating-point format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision &ndash; up to 3.3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
 1.  Porting the model to use the FP16 data type where appropriate.    
 2.  Adding loss scaling to preserve small gradient values.

@ -131,17 +141,17 @@ Mixed precision training is turned off by default. To turn it on issue the `--am

 TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. 

-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models that require a high dynamic range for weights or activations.

 For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.

 TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.

-### Hybrid-parallel multiGPU with all-2-all communication
+### Hybrid-parallel multi-GPU with all-2-all communication

-Many recommendation models contain very large embedding tables. As a result the model is often too large to fit onto a single device. This could be easily solved by training in a model-parallel way, using either the CPU or other GPUs as "memory donors". However, this approach is suboptimal as the "memory donor" devices' compute is not utilized. In this repository we use the model-parallel approach for the bottom part of the model (Embedding Tables + Bottom MLP) while using a usual data parallel approach for the top part of the model (Dot Interaction + Top MLP). This way we can train models much larger than what would normally fit into a single GPU while at the same time making the training faster by using multiple GPUs. We call this approach hybrid-parallel.
+Many recommendation models contain very large embedding tables. As a result, the model is often too large to fit onto a single device. This could be easily solved by training in a model-parallel way, using either the CPU or other GPUs as "memory donors". However, this approach is suboptimal as the "memory donor" devices' compute is not utilized. In this repository, we use the model-parallel approach for the bottom part of the model (Embedding Tables + Bottom MLP) while using a usual data parallel approach for the top part of the model (Dot Interaction + Top MLP). This way we can train models much larger than what would normally fit into a single GPU while at the same time making the training faster by using multiple GPUs. We call this approach hybrid-parallel.

-The transition from model-parallel to data-parallel in the middle of the neural net needs a specific multiGPU communication pattern called [all-2-all](https://en.wikipedia.org/wiki/All-to-all_\(parallel_pattern\)) which is available in our [PyTorch 20.06-py3] NGC docker container. In the [original DLRM whitepaper](https://arxiv.org/abs/1906.00091) this has been also referred to as "butterlfy shuffle". 
+The transition from model-parallel to data-parallel in the middle of the neural net needs a specific multi-GPU communication pattern called [all-2-all](https://en.wikipedia.org/wiki/All-to-all_\(parallel_pattern\)) which is available in our [PyTorch 21.02-py3](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch/tags) NGC docker container. In the [original DLRM whitepaper](https://arxiv.org/abs/1906.00091) this has been also referred to as "butterfly shuffle". 

 <p align="center">
  <img width="100%" src="./img/hybrid_parallel.png" />
@ -149,7 +159,7 @@ The transition from model-parallel to data-parallel in the middle of the neural
 </p>


-In the example shown in this repository we train models of two sizes: "small" (~15 GB) and "large" (~82 GB). We use the hybrid-parallel approach only for the "large" model while the "small" one supports only singleGPU training for now.
+In the example shown in this repository we train models of three sizes: "small" (~15 GB), "large" (~82 GB), and "xlarge" (~142 GB). We use the hybrid-parallel approach for the "large" and "xlarge" models, as they do not fit in a single GPU.

 #### Embedding table placement and load balancing

@ -158,15 +168,14 @@ We use the following heuristic for dividing the work between the GPUs:
 - The tables are sorted from the largest to the smallest
 - Set `max_tables_per_gpu := ceil(number_of_embedding_tables / number_of_available_gpus)`
 - Repeat until all embedding tables have an assigned device:
-    - Out of all the available GPUs find the one with largest amount of unallocated memory
+    - Out of all the available GPUs find the one with the largest amount of unallocated memory
    - Place the largest unassigned embedding table on this GPU. Raise an exception if it does not fit.
-    - If the number of embedding tables on this GPU is now equal to `max_tables_per_gpu` remove this GPU from the list of available GPUs, so that no more embedding tables will placed on this GPU. This ensures the all2all communication is well balanced between all devices.
+    - If the number of embedding tables on this GPU is now equal to `max_tables_per_gpu` remove this GPU from the list of available GPUs so that no more embedding tables will be placed on this GPU. This ensures the all2all communication is well balanced between all devices.

+### Preprocessing on GPU

-### Preprocessing on GPU with Spark 3
+Please refer to [the "Preprocessing" section](#preprocessing) for a detailed description of the Apache Spark 3.0 and NVTabular GPU functionality.

-Please refer to [the "Preprocessing with Spark" section](#preprocess-with-spark) for detailed description of the Spark 3 GPU functionality 
- 
 ## Setup

 The following section lists the requirements for training DLRM.
@ -174,17 +183,17 @@ The following section lists the requirements for training DLRM.
 ### Requirements

 This repository contains Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-   [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-   [PyTorch 20.06-py3] NGC container
-   Supported GPUs:
+- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+- [PyTorch 21.02-py3](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch/tags) NGC container
+- Supported GPUs:
    - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-    - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
+    - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
    - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)


 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-   [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-   [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
+- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
 - [Running PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
  
 For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
@ -201,44 +210,57 @@ git clone https://github.com/NVIDIA/DeepLearningExamples
 cd DeepLearningExamples/PyTorch/Recommendation/DLRM
 ```

-2. Build a DLRM Docker container
+2. Download the dataset.
+
+You can download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/.
+When you have successfully downloaded it and unpacked it, set the `CRITEO_DATASET_PARENT_DIRECTORY` to its parent directory:
+```
+CRITEO_DATASET_PARENT_DIRECTORY=/raid/dlrm
+``` 
+We recommend choosing the fastest available file system; otherwise, data loading may become an I/O bottleneck.
+
+3. Build DLRM Docker containers
 ```bash
 docker build -t nvidia_dlrm_pyt .
-docker build -t nvidia_dlrm_spark -f Dockerfile_spark . 
+docker build -t nvidia_dlrm_preprocessing -f Dockerfile_preprocessing . --build-arg DGX_VERSION=[DGX-2|DGX-A100]
 ```

 3. Start an interactive session in the NGC container to run preprocessing.
-The NCF PyTorch container can be launched with:
+The DLRM PyTorch container can be launched with:
 ```bash
-mkdir -p data
-docker run --runtime=nvidia -it --rm --ipc=host  -v ${PWD}/data:/data nvidia_dlrm_spark bash
+docker run --runtime=nvidia -it --rm --ipc=host  -v ${CRITEO_DATASET_PARENT_DIRECTORY}:/data/dlrm nvidia_dlrm_preprocessing bash
 ```

-4.  Download and preprocess the dataset.
+4.  Preprocess the dataset.

-You can download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/.
-When you have successfully downloaded it, put it in the `/data/dlrm/criteo/` directory in the container (`$PWD/data/dlrm/criteo` in the host system).
+Here are a few examples of different preprocessing commands. For details on how those scripts work and a detailed description of the dataset types (small FL=15, large FL=3, xlarge FL=2), training possibilities, and all the parameters, consult the [preprocessing section](#preprocessing).

-Here are a few examples of different preprocessing commands. For the details on how those scripts work and detailed description of all the parameters please consult the [preprocess with spark section](#preprocess-with-spark).
- 
+Depending on the dataset type (small FL=15, large FL=3, xlarge FL=2), run one of the following commands:
+
+4.1. Preprocess to small dataset (FL=15) with Spark GPU:
 ```bash
-cd preproc
-
-# to run on a DGX2 with a frequency limit of 3 (will need 8xV100-32GB to fit the model in GPU memory)
-./prepare_dataset.sh DGX2 3
-
-# to run on a DGX2 with a frequency limit of 15 (should fit on a single V100-32GB):
-./prepare_dataset.sh DGX2 15
-#
-# to run on CPU with a frequency limit of 15:
-./prepare_dataset.sh CPU 15
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh 15 GPU Spark
 ```

+4.2. Preprocess to large dataset (FL=3) with Spark GPU:
+```bash
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh 3 GPU Spark
+```
+
+4.3. Preprocess to xlarge dataset (FL=2) with Spark GPU:
+```bash
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh 2 GPU Spark
+```
+
+
 5. Start training.

- First start the docker container:
+- First start the docker container (adding the `--security-opt seccomp=unconfined` option is needed to take full advantage of processor affinity in multi-GPU training):
 ```bash
-docker run --runtime=nvidia -it --rm --ipc=host  -v ${PWD}/data:/data nvidia_dlrm_pyt bash
+docker run --security-opt seccomp=unconfined --runtime=nvidia -it --rm --ipc=host  -v ${PWD}/data:/data nvidia_dlrm_pyt bash
 ```

 - single-GPU:
@ -250,26 +272,40 @@ python -m dlrm.scripts.main --mode train --dataset /data/dlrm/binary_dataset/
 ```bash
 python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
          bash  -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.dist_main \
-          --dataset /data/dlrm/binary_dataset/--seed 0 --epochs 1 --amp'
+          --dataset /data/dlrm/binary_dataset/ --seed 0 --epochs 1 --amp'
 ```

- multi-GPU for DGX1 and DGX2:
+- multi-GPU for DGX-1 and DGX-2:
 ```bash
 python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
          bash  -c './bind.sh  --cpu=exclusive -- python -m dlrm.scripts.dist_main \
-          --dataset /data/dlrm/binary_dataset/--seed 0 --epochs 1 --amp'
+          --dataset /data/dlrm/binary_dataset/ --seed 0 --epochs 1 --amp'
 ```

-6. Start validation/evaluation.
+6. Start validation/evaluation. If you want to run validation or evaluation, you can either:
+ - use the checkpoint obtained from the training commands above, or
+ - download the pretrained checkpoint from NGC.

+To download the checkpoint from NGC, visit the ngc.nvidia.com website and browse the available models. Download the checkpoint files and unzip them to some path, for example, to `$CRITEO_DATASET_PARENT_DIRECTORY/checkpoints/`. The checkpoint requires around 15GB of disk space.
+
+Commands:
 - single-GPU:
-```
-python -m dlrm.scripts.main --mode test --dataset /data/dlrm/binary_dataset/
+```bash
+python -m dlrm.scripts.main --mode test --dataset /data/dlrm/binary_dataset/ --load_checkpoint_path $CRITEO_DATASET_PARENT_DIRECTORY/checkpoints/checkpoint
 ```

- multi-GPU:
+- multi-GPU for DGX A100:
+```bash
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
+          bash  -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.dist_main \
+          --dataset /data/dlrm/binary_dataset/ --seed 0 --epochs 1 --amp --load_checkpoint_path $CRITEO_DATASET_PARENT_DIRECTORY/checkpoints/checkpoint'
 ```
-python -u -m torch.distributed.launch --use_env --nproc_per_node 8 -m dlrm.scripts.dist_main --mode test --dataset /data/dlrm/binary_dataset
+
+- multi-GPU for DGX-1 and DGX-2:
+```bash
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
+          bash  -c './bind.sh  --cpu=exclusive -- python -m dlrm.scripts.dist_main \
+          --dataset /data/dlrm/binary_dataset/ --seed 0 --epochs 1 --amp --load_checkpoint_path $CRITEO_DATASET_PARENT_DIRECTORY/checkpoints/checkpoint'
 ```

 ## Advanced
@ -278,9 +314,9 @@ The following sections provide greater details of the dataset, running training

 ### Scripts and sample code

-The `dlrm/scripts/main.py` script provides an entry point to most of the functionality in single-GPU setting. Using different command-line flags allows you to run training, validation and benchmark both training and inference on real or synthetic data.
+The `dlrm/scripts/main.py` script provides an entry point to most of the functionality in a single-GPU setting. Using different command-line flags allows you to run training, validation, and benchmark both training and inference on real or synthetic data.

-Analogously, the `dlrm/scripts/dist_main.py` script provides an entry point for the functionality in multi-GPU setting. It uses the same flags as in single-GPU case with the defaults tuned to large model training.
+Analogously, the `dlrm/scripts/dist_main.py` script provides an entry point for the functionality in a multi-GPU setting. It uses the same flags as in the single-GPU case with the defaults tuned to large model training.

 The `dlrm/model/single.py` file provides the definition of the DLRM neural network for single-GPU, whereas `dlrm/model/distributed.py` contains DLRM definition for multi-GPU case.

@ -293,10 +329,10 @@ Utilities connected to loading the data reside in the `data` directory.
 The `dlrm/scripts/main.py` script supports a number of command-line flags. You can get the descriptions of those by running `python -m dlrm.scripts.main --help`. Running this command will output:

 ```        
-       USAGE: /workspace/dlrm/scripts/main.py [flags]
+USAGE: /workspace/dlrm/scripts/main.py [flags]
 flags:

-/workspace/dlrm/scripts/main.py:
+/workspace/dlrm/dlrm/scripts/main.py:
  --[no]amp: If True the script will use Automatic Mixed Precision
    (default: 'false')
  --auc_threshold: Stop the training after achieving this AUC
@ -312,11 +348,8 @@ flags:
  --bottom_mlp_sizes: Linear layer sizes for the bottom MLP
    (default: '512,256,128')
    (a comma separated list)
-  --dataset: Full path to binary dataset. Must include files such as: train_data.bin, test_data.bin
-  --dataset_subset: Use only a subset of the training data. If None (default) will use all of it. Must be either None, or a float in
-    range [0,1]
-    (a number)
-  --dataset_type: <binary|memmap|split|synthetic_gpu|synthetic_disk>: The type of the dataset to use
+  --dataset: Path to dataset
+  --dataset_type: <binary|split|synthetic_gpu>: The type of the dataset to use
    (default: 'split')
  --decay_end_lr: LR after the decay ends
    (default: '0.0')
@ -324,8 +357,7 @@ flags:
  --decay_power: Polynomial learning rate decay power
    (default: '2')
    (an integer)
-  --decay_start_step: Optimization step after which to start decaying the learning rate, if None will start decaying right after the
-    warmup phase is completed
+  --decay_start_step: Optimization step after which to start decaying the learning rate, if None will start decaying right after the warmup phase is completed
    (default: '64000')
    (an integer)
  --decay_steps: Polynomial learning rate decay steps. If equal to 0 will not do any decaying
@ -360,10 +392,9 @@ flags:
    (a number)
  --max_steps: Stop training after doing this many optimization steps
    (an integer)
-  --max_table_size: Maximum number of rows per embedding table, by default equal to the number of unique values for each categorical
-    variable
+  --max_table_size: Maximum number of rows per embedding table, by default equal to the number of unique values for each categorical variable
    (an integer)
-  --mode: <train|test|inference_benchmark>: Select task to be performed
+  --mode: <train|test|inference_benchmark|prof-train>: Select task to be performed
    (default: 'train')
  --num_numerical_features: Number of numerical features in the dataset. Defaults to 13 for the Criteo Terabyte Dataset
    (default: '13')
@ -381,14 +412,12 @@ flags:
    (an integer)
  -shuffle,--[no]shuffle_batch_order: Read batch in train dataset by random order
    (default: 'false')
-  --synthetic_dataset_dir: Default synthetic disk dataset directory
-    (default: '/tmp/dlrm_sythetic_dataset')
  --synthetic_dataset_num_entries: Number of samples per epoch for the synthetic dataset
    (default: '33554432')
    (an integer)
  --synthetic_dataset_table_sizes: Embedding table sizes to use with the synthetic dataset
-    (default: '100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,10
-    0000,100000,100000,100000,100000,100000,100000,100000,100000')
+    (default: '100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,10
+    0000,100000,100000,100000')
    (a comma separated list)
  --test_after: Don't test the model unless this many epochs has been completed
    (default: '0.0')
@ -408,6 +437,22 @@ flags:
    (default: '6400')
    (an integer)
 ``` 
+The multi-GPU training script, `dlrm/scripts/dist_main.py`, also has a few options of its own, which you can list by running `python -m dlrm.scripts.dist_main --help`:
+```
+USAGE: /workspace/dlrm/dlrm/scripts/dist_main.py [flags]
+flags:
+
+/workspace/dlrm/dlrm/scripts/dist_main.py:
+  --[no]Adam_MLP_optimizer: Swaps MLP optimizer to Adam
+    (default: 'false')
+  --[no]Adam_embedding_optimizer: Swaps embedding optimizer to Adam
+    (default: 'false')
+  --backend: Backend to use for distributed training. Default nccl
+    (default: 'nccl')
+  --[no]bottom_features_ordered: Sort features from the bottom model, useful when using saved checkpoint in different
+    device configurations
+    (default: 'false')
+```


 The following example output is printed when running the model:
@ -435,9 +480,9 @@ The first 23 days are used as the training set. The last day is split in half. T

 The preprocessing steps applied to the raw data include:
 - Replacing the missing values with `0`
- Replacing the categorical values that exist fewer than 15 times with a special value
+- Replacing the categorical values that exist fewer than `T` times with a special value (T value is called a frequency threshold or a frequency limit)
 - Converting the hash values to consecutive integers
- Adding 2 to all the numerical features so that all of them are greater or equal to 1
+- Adding 3 to all the numerical features so that all of them are greater than or equal to 1
 - Taking a natural logarithm of all numerical features

 #### Multi-dataset
@ -448,75 +493,64 @@ Our preprocessing scripts are designed for the Criteo Terabyte Dataset and shoul
 - The next `M` tokens should contain the hashed categorical features separated by tabs.


-#### Preprocess with Spark
+#### Preprocessing 

-The preprocessing scripts provided in this repository support running both on CPU and on DGX-2 using [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/).
-It should be possible to change the values in `preproc/dgx2_config.sh`
-so that they'll work on also on other hardware platforms such as DGX-1.
+The preprocessing scripts provided in this repository support running both on CPU and GPU using [NVTabular](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) (GPU only) and [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/).
+
+Please note that the preprocessing will require about 4TB of disk storage. 

-Please note that the preprocessing will require about 4TB of disk storage.

 The syntax for the preprocessing script is as follows:
 ```bash
-cd preproc
-./prepare_dataset.sh <DGX2|CPU> <frequency_threshold>
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh <frequency_threshold> <GPU|CPU> <NVTabular|Spark>
 ```

-The first argument is the hardware platform to use (either DGX2 or pure-CPU). The second argument means the frequency 
-threshold to apply to the categorical variables. For a frequency threshold `T`, the categorical values that occur less 
-often than `T` will be replaced with a special embedding. Thus, a larger value of `T` will require smaller embedding tables 
-and will substantially reduce the overall size of the model.
-
-For the Criteo Terabyte dataset we recommend a frequency threshold of `T=3` if you intend to run the hybrid-parallel mode
+For the Criteo Terabyte dataset, we recommend a frequency threshold of `T=3` (when using A100 40GB or V100 32GB) or `T=2` (when using A100 80GB) if you intend to run the hybrid-parallel mode
 on multiple GPUs. If you want to make the model fit into a single NVIDIA Tesla V100-32GB, you can set `T=15`. 

-The preprocessing scripts makes use of the following environment variables to configure the data directory paths:
+The first argument means the frequency threshold to apply to the categorical variables. For a frequency threshold `T`, the categorical values that occur less 
+often than `T` will be replaced with one special value for each category. Thus, a larger value of `T` will require smaller embedding tables 
+and will substantially reduce the overall size of the model.
+
+The second argument is the hardware to use (either GPU or CPU).  
+
+The third argument is the framework to use (either NVTabular or Spark). When CPU preprocessing is chosen, this argument is omitted because only Apache Spark is supported on CPU.
+
+The preprocessing scripts make use of the following environment variables to configure the data directory paths:
 - `download_dir` – this directory should contain the original Criteo Terabyte CSV files
 - `spark_output_path` – directory to which the parquet data will be written
 - `conversion_intermediate_dir` – directory used for storing intermediate data used to convert from parquet to train-ready format
 - `final_output_dir` – directory to store the final results of the preprocessing which can then be used to train DLRM 

-
-The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.0, which will start a standalone cluster of Spark. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`. 
-Generate the dictionary
-Transform train dataset
-Transform test dataset
-Transform validation dataset
-
-    Change the variables in the `run-spark.sh` script according to your environment.
-    Configure the paths.
-```
-export SPARK_LOCAL_DIRS=/data/spark-tmp
-export INPUT_PATH=/data/criteo
-export OUTPUT_PATH=/data/output
-```
-Note that the Spark job requires about 3TB disk space used for data shuffle.
-
-`SPARK_LOCAL_DIRS` is the path where Spark uses to write shuffle data.
-
-`INPUT_PATH` is the path of the Criteo Terabyte Dataset, including uncompressed files like day_0, day_1…
-
-`OUTPUT_PATH` is where the script writes the output data. It will generate below subdirectories of `models`, `train`, `test`, and `validation`. 
-The `model` is the dictionary folder. 
+Three subdirectories will be created in `final_output_dir`: `train`, `test`, and `validation`, along with one JSON file &ndash; `model_size.json` &ndash; containing the maximal index of each category. 
 The `train` is the train dataset transformed from day_0 to day_22. 
 The `test` is the test dataset transformed from the prior half of day_23. 
 The `validation` is the dataset transformed from the latter half of day_23.

-Configure the resources which Spark will use.
-```
-export TOTAL_CORES=80
-export TOTAL_MEMORY=800
-```
-`TOTAL_CORES` is the total CPU cores you want Spark to use.
+The model is tested on 3 datasets resulting from Criteo dataset preprocessing: small (frequency threshold = 15), large (frequency threshold = 3), and xlarge (frequency threshold = 2). Each dataset occupies approximately 370GB of disk space. The table below presents information on the supercomputer and GPU count needed to train the model on a particular dataset.

-`TOTAL_MEMORY` is the total memory Spark will use.
+| Dataset | GPU VRAM consumption\* | Model checkpoint size\* | FL setting | DGX A100 40GB, 1GPU | DGX A100 40GB, 8GPU | DGX A100 80GB, 1GPU | DGX A100 80GB, 8GPU | DGX-1** or DGX-2, 1 GPU | DGX-1** or DGX-2, 8GPU | DGX-2, 16GPU |
+| ------- | ---------------------- | ----------------------- | ---------- | -------------------- | -------------------- | -------------------- | -------------------- | ---------------------- | --------------------- | ------------ |
+| small (FL=15) | 20.5 | 15.0 | 15 | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
+| large (FL=3) | 132.3 | 81.9 | 3 | NA | Yes | NA | Yes | NA | Yes | Yes |
+| xlarge (FL=2) | 198.8 | 141.3 | 2 | NA | NA | NA | Yes | NA | NA | NA |

-Configure frequency limit.
-```
-USE_FREQUENCY_LIMIT=15
-```
-The frequency limit is used to filter out the categorical values which appear less than n times in the whole dataset, and make them be 0. Change this variable to 1 to enable it. The default frequency limit is 15 in the script. You also can change the number as you want by changing  the line of `OPTS="--frequency_limit 8"`.
+\*with default embedding dimension setting
+\**DGX-1 V100 32GB

+##### NVTabular
+
+NVTabular preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values of `ALL_DS_MEM_FRAC`, `TRAIN_DS_MEM_FRAC`, `TEST_DS_MEM_FRAC`, `VALID_DS_MEM_FRAC` in `preproc/preproc_NVTabular.py`, so that they'll also work on other hardware platforms such as DGX-1 or a custom one. 
+
+##### Spark
+
+The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.1, which will start a standalone cluster of Spark. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`. 
+
+Note that the Spark job requires about 3TB disk space used for data shuffle.
+
+Spark preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values in `preproc/DGX-2_config.sh` or `preproc/DGX-A100_config.sh`
+so that they'll also work on other hardware platforms such as DGX-1 or a custom one. 

 ### Training process

@ -526,18 +560,17 @@ generated by the model is measured by the [ROC AUC metric](https://scikit-learn.
 The speed of training and inference is measured by throughput i.e., the number 
 of samples processed per second. We use mixed precision training with static loss scaling for the bottom and top MLPs while embedding tables are stored in FP32 format.

-
 ### Inference process

 This section describes inference with PyTorch in Python. If you're interested in inference using the Triton Inference Server, refer to [triton/README.md](triton/README.md) file.

 Two modes for inference are currently supported by the `dlrm/scripts/main.py` script:

-1. Inference benchmark – this mode will measure and print out throughput and latency numbers for multiple batch sizes. You can activate it by setting the batch sizes to be tested with the `inference_benchmark_batch_sizes` command-line argument. It will use the default test dataset unless the `--dataset_type synthetic_disk` flag is passed.
-2. Test-only – this mode can be used to run a full validation on a checkpoint to measure ROC AUC . You can enable it by passing the `--mode test` flag.
+1. Inference benchmark – this mode will measure and print out throughput and latency numbers for multiple batch sizes. You can activate it by setting the batch sizes to be tested with the `inference_benchmark_batch_sizes` command-line argument.
+2. Test-only – this mode can be used to run a full validation on a checkpoint to measure ROC AUC. You can enable it by passing the `--mode test` flag.

 ### Deploying DLRM Using NVIDIA Triton Inference Server
-The NVIDIA Triton Inference Server provides a cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. More information on how to perform inference using NVIDIA Triton Inference Server can be found in [triton/README.md](triton/README.md).
+The NVIDIA Triton Inference Server provides a cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. More information on how to perform inference using NVIDIA Triton Inference Server can be found in [triton/README.md](triton/README.md).

 ## Performance

@ -551,7 +584,7 @@ To benchmark the training performance on a specific batch size, please follow th
 in the [Quick Start Guide](#quick-start-guide). You can also add the `--max_steps 1000 --benchmark_warmup_steps 500`
 if you want to get a reliable throughput measurement without running the entire training. 

-You can also pass the `--dataset_type synthetic_disk` flag if you haven't yet downloaded the dataset.
+You can also create a synthetic dataset by running `python -m dlrm.scripts.prepare_synthetic_dataset --synthetic_dataset_dir /tmp/dlrm_synthetic_data` if you haven't yet downloaded the dataset.

 #### Inference performance benchmark

@ -561,29 +594,31 @@ To benchmark the inference performance on a specific batch size, run:
 python -m dlrm.scripts.main --mode inference_benchmark --dataset /data
 ```

-You can also pass the `--dataset_type synthetic_disk` flag if you haven't yet downloaded the dataset.
+You can also create a synthetic dataset by running `python -m dlrm.scripts.prepare_synthetic_dataset --synthetic_dataset_dir /tmp/dlrm_synthetic_data` if you haven't yet downloaded the dataset.

 ### Results 

 The following sections provide details on how we achieved our performance and accuracy in training and inference. 

-We used two model size variants to show memory scalability in multi-GPU setup:
- small - refers to model trained on Criteo dataset with frequency thresholding set to 15 resulting in smaller embedding tables - total model size: ~15 GB
- large - refers to model trained on Criteo dataset wtih frequency thresholding set to 3 resulting in larger embedding tables - total model size: ~82 GB
+We used three model size variants to show memory scalability in a multi-GPU setup:
+- small - refers to a model trained on Criteo dataset with frequency thresholding set to 15 resulting in smaller embedding tables - total model size: ~15 GB
+- large - refers to a model trained on Criteo dataset with frequency thresholding set to 3 resulting in larger embedding tables - total model size: ~82 GB
+- xlarge -  refers to a model trained on Criteo dataset with frequency thresholding set to 2 resulting in larger embedding tables - total model size: ~142 GB

 #### Training accuracy results


-##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)

 Our results were obtained by running training scripts as described in the Quick Start Guide in the DLRM Docker container in two configurations:
- on a single NVIDIA A100 40GB GPU (`dlrm/scripts/main.py`)
- in multi-GPU setup on DGX A100 with 8x Ampere A100 40GB (`dlrm/scripts/dist_main.py`)
+- on a single NVIDIA A100 80GB GPU (`dlrm/scripts/main.py`)
+- in multi-GPU setup on DGX A100 with 8x Ampere A100 80GB (`dlrm/scripts/dist_main.py`)

 | GPUs    | Model size    | Batch size / GPU    | Accuracy (AUC) - TF32  | Accuracy (AUC) - mixed precision  |   Time to train - TF32 [minutes]  |  Time to train - mixed precision [minutes] | Time to train speedup (TF32 to mixed precision)        
 |----:|----|----|----:|----:|---:|---:|---:|
-| 8 | large | 64k | 0.8027 | 0.8027 | 7.72 | 4.9 | 1.58 |
-| 1 | small | 32k | 0.8036 | 0.8036 | 28.20 | 17.45 | 1.62 |
+| 8 | xlarge | 64k | 0.8026 | 0.8026 |  6.63 |  4.78 | 1.39 |
+| 8 |  large | 64k | 0.8027 | 0.8027 |  6.62 |  4.85 | 1.36 |
+| 1 |  small | 32k | 0.8036 | 0.8036 | 26.05 | 17.45 | 1.49 |


 ##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
@ -598,31 +633,106 @@ Our results were obtained by running training scripts as described in the Quick
 | 1 | small | 32k | 0.8035 | 0.8035 | 105.98 | 31.12 | 3.40 |


+##### Training accuracy plots
+
+Models trained with FP32, TF32, and Automatic Mixed Precision (AMP) achieve similar accuracy.
+
+The plot represents ROC AUC metric as a function of steps (step is single batch) during training for default precision (FP32 for Volta architecture (DGX-1) and TF32 for Ampere GPU architecture (DGX-A100)), and AMP for all three datasets. 
+All other parameters of training are default.
+
+<p align="center">
+  <img width="100%" src="./img/learning_curve_FL2.svg" />
+  <br>
+  Figure 1. Learning curves for the FL2 dataset: ROC AUC as a function of training steps for the default precision and AMP.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/learning_curve_FL3.svg" />
+  <br>
+  Figure 2. Learning curves for the FL3 dataset: ROC AUC as a function of training steps for the default precision and AMP.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/learning_curve_FL15.svg" />
+  <br>
+  Figure 3. Learning curves for the FL15 dataset: ROC AUC as a function of training steps for the default precision and AMP.
+</p>
+
+
 ##### Training stability test

-The table below shows the complete convergence data for 16 different random seeds. 
+Training of the model is stable for multiple configurations, achieving a standard deviation on the order of 1e-4. 
+The model achieves similar ROC AUC scores across A100 and V100 GPUs and across training precisions. 
+The DLRM model was trained for one epoch (roughly 4 billion samples, 128028 batches for single-GPU and 64014 for multi-GPU training), starting from 20 different initial random seeds for each setup.
+The training was performed in the pytorch:21.02-py3 NGC container with and without mixed precision enabled.
+The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the test set. 
+The following plots compare distributions of ROC AUC on the test set. 
+The columns compare single-GPU and 8-GPU training; the rows correspond to the type of hardware: A100 and V100.

-|   Random seed |  Mixed precision AUC | Single precision AUC |
-|-------:|---------:|---------:|
-|      8 | 0.803696 | 0.803669 |
-|      9 | 0.803617 | 0.803574 |
-|     10 | 0.803672 | 0.80367  |
-|     11 | 0.803699 | 0.803683 |
-|     12 | 0.803659 | 0.803724 |
-|     13 | 0.803578 | 0.803565 |
-|     14 | 0.803609 | 0.803613 |
-|     15 | 0.803585 | 0.803615 |
-|     16 | 0.803553 | 0.803583 |
-|     17 | 0.803644 | 0.803688 |
-|     18 | 0.803656 | 0.803609 |
-|     19 | 0.803589 | 0.803635 |
-|     20 | 0.803567 | 0.803611 |
-|     21 | 0.803548 | 0.803487 |
-|     22 | 0.803532 | 0.803591 |
-|     23 | 0.803625 | 0.803601 |
-| **mean** | **0.803614** | **0.803620** |
+<p align="center">
+  <img width="100%" src="./img/training_stability_FL2.svg" />
+  <br>
+  Figure 4. Training stability for a FL2 dataset: distribution of ROC AUC across different configurations. 'All configurations' refer to the distribution of ROC AUC for cartesian product of architecture, training precision. A single distribution is presented since only DGX A100 80GB is large enough to support the dataset with FL=2. See [Preprocessing section](#preprocessing) for more details.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/training_stability_FL3.svg" />
+  <br>
+  Figure 5. Training stability for a FL3 dataset: distribution of ROC AUC across different configurations. 'All configurations' refer to the distribution of ROC AUC for cartesian product of architecture, training precision.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/training_stability_FL15.svg" />
+  <br>
+  Figure 6. Training stability for a FL15 dataset: distribution of ROC AUC across different configurations. 'All configurations' refer to the distribution of ROC AUC for cartesian product of architecture, training precision.
+</p>
+
+Training stability was also compared in terms of point statistics for ROC AUC distribution for multiple configurations. Refer to the expandable table below.
+
+##### Impact of mixed precision on training accuracy
+
+The accuracy of training, measured with the ROC AUC metric on the test set after the final epoch, was not impacted by enabling mixed precision. The obtained results were statistically similar. The similarity was measured according to the following procedure:
+
+The model was trained 20 times for default settings (FP32 or TF32 for Volta and Ampere architecture respectively) and 20 times for AMP. After the last epoch, the accuracy score ROC AUC was calculated on the test set.
+
+Distributions for two hardware configurations (A100, V100) for 3 datasets are presented below.
+
+<p align="center">
+  <img width="100%" src="./img/amp_impact_FL2.svg" />
+  <br>
+  Figure 7. Influence of AMP on ROC AUC distribution for A100 and V100 GPUs for single- and multi-GPU training on a dataset with a frequency threshold of 2.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/amp_impact_FL3.svg" />
+  <br>
+  Figure 8. Influence of AMP on ROC AUC distribution for A100 and V100 GPUs for single- and multi-GPU training on a dataset with a frequency threshold of 3.
+</p>
+
+<p align="center">
+  <img width="100%" src="./img/amp_impact_FL15.svg" />
+  <br>
+  Figure 9. Influence of AMP on ROC AUC distribution for A100 and V100 GPUs for single- and multi-GPU training on a dataset with a frequency threshold of 15.
+</p>


+Distributions of AUC ROC for single precision training (TF32 for A100, FP32 for Volta) and AMP training were compared in terms of mean, variance, and the [Kolmogorov–Smirnov test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test) to determine whether there is a statistical difference between single precision and AMP results. Refer to the expandable table below.
+
+<details>
+<summary>Full tabular data for AMP influence on AUC ROC</summary>
+
+| Supercomputer | Dataset | GPUs | mean AUC ROC for TF32 (DGX A100)/ FP32 (DGX-1,DGX-2) | std AUC ROC for TF32 (DGX A100)/ FP32 (DGX-1,DGX-2) |mean AUC ROC for AMP | std AUC ROC for AMP | KS test value: statistics, p-value |
+| ------------- | -----| ------- | ---------------------------------------------------- | ----------------------------------------------------|---------------------|-------------------- | -----------------------------------|
+DGX A100|FL2|8|0.80261|0.00008|0.80265|0.00008|0.30000 (0.33559)|
+DGX A100|FL3|8|0.80266|0.00007|0.80266|0.00006|0.10000 (0.99999)|
+DGX A100|FL15|1|0.80361|0.00004|0.80363|0.00005|0.20000 (0.83197)|
+DGX-2 / DGX-1|FL3|8|0.80266|0.00008|0.80266|0.00006|0.20000 (0.83197)|
+DGX-2 |FL3|16|0.80265|0.00006|0.80266|0.00005|0.15000 (0.98314)|
+DGX-2 / DGX-1|FL15|1|0.80360|0.00006|0.80362|0.00006|0.20000 (0.83197)|
+
+Sample size was set to 20 experiments for each training setup.
+
+</details>

 #### Training performance results

@ -630,70 +740,96 @@ The table below shows the complete convergence data for 16 different random seed
 We used throughput in items processed per second as the performance metric.


-##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+##### Training performance: NVIDIA DGX A100 (8x A100 80GB)

 Our results were obtained by running the following commands:
- for single GPU setup:
+- for single-GPU setup:
 ```
-python -m dlrm.scripts.main --mode train --dataset /data [--amp]
+python -m dlrm.scripts.main --dataset /data [--amp]
 ```
- for multi GPU setup:
+- for multi-GPU setup:
 ```
-python -u -m torch.distributed.launch --use_env --nproc_per_node 8 -m dlrm.scripts.dist_main --mode train --dataset /data/ [--amp]
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
+          bash  -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.dist_main \
+          --dataset /data [--amp]'
 ```

-in the DLRM Docker container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
+in the DLRM Docker container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in records of data per second) were averaged over an entire training epoch.

 | GPUs   | Model size    | Batch size / GPU   | Throughput - TF32    | Throughput - mixed precision    | Throughput speedup (TF32 - mixed precision)      
 |----:|----|----|---:|---:|---:|
-| 8 | large | 64k | 9,056,775 | 14,230,793 | 1.57 |
-| 1 | small | 32k | 2,498,002 | 4,081,969 | 1.63 |
-
+| 8 | xlarge | 64k | 10,538,937 | 14,608,934 | 1.39 |
+| 8 |  large | 64k | 10,556,858 | 14,369,146 | 1.36 |
+| 1 |  small | 32k |  2,684,082 |  4,006,897 | 1.49 |

 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).

+##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+
+Our results were obtained by running the following commands:
+- for single-GPU setup:
+```
+python -m dlrm.scripts.main --dataset /data [--amp]
+```
+- for multi-GPU setup:
+```
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
+          bash  -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.dist_main \
+          --dataset /data/ [--amp]'
+```
+
+in the DLRM Docker container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in records of data per second) were averaged over an entire training epoch.
+
+| GPUs   | Model size    | Batch size / GPU   | Throughput - TF32    | Throughput - mixed precision    | Throughput speedup (TF32 - mixed precision)      
+|----:|----|----|---:|---:|---:|
+| 8 | large | 64k | 9,729,442 | 13,860,895 | 1.42 |
+| 1 | small | 32k | 2,489,746 |  3,859,449 | 1.55 |
+

 ##### Training performance: NVIDIA DGX-1 (8x V100 32GB)

 Our results were obtained by running the following commands:
- for single GPU setup:
+- for single-GPU setup:
 ```
 python -m dlrm.scripts.main --mode train --dataset /data [--amp]
 ```
- for multi GPU setup:
+- for multi-GPU setup:
 ```
-python -u -m torch.distributed.launch --use_env --nproc_per_node 8 -m dlrm.scripts.dist_main --mode train --dataset /data/ [--amp]
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \
+          bash  -c './bind.sh  --cpu=exclusive -- python -m dlrm.scripts.dist_main \
+          --dataset /data [--amp]'
 ```

- in the DLRM Docker container on NVIDIA DGX-1 with (8x V100 32GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
+ in the DLRM Docker container on NVIDIA DGX-1 with (8x V100 32GB) GPUs. Performance numbers (in records of data per second) were averaged over an entire training epoch.

 | GPUs   | Model size    | Batch size / GPU   | Throughput - FP32    | Throughput - mixed precision    | Throughput speedup (FP32 - mixed precision)   |     
 |----:|----|----|---:|---:|---:|
-| 8 | large | 64k | 1,620,421 | 3,305,045 | 2.04 |
-| 1 | small | 32k | 670,239 | 2,281,278 | 3.40 |
+| 8 | large | 64k | 2,761,951 | 6,489,102 | 2.34 |
+| 1 | small | 32k |   639,906 | 2,125,239 | 3.32 |

-
-We used throughput in items processed per second as the performance metric.
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).


 ##### Training performance: NVIDIA DGX-2 (16x V100 32GB)

 Our results were obtained by running the following commands:
- for single GPU setup:
+- for single-GPU setup:
 ```
-python -m dlrm.scripts.main --mode train --dataset /data [--amp] 
+python -m dlrm.scripts.main --dataset /data [--amp] 
 ```
- for multi GPU setup:
+- for multi-GPU setup:
 ```
-python -u -m torch.distributed.launch --use_env --nproc_per_node 16 -m dlrm.scripts.dist_main --mode train --dataset /data/ [--amp]
+python -m torch.distributed.launch --no_python --use_env --nproc_per_node [8/16] \
+          bash  -c './bind.sh  --cpu=exclusive -- python -m dlrm.scripts.dist_main \
+          --dataset /data [--amp]'
 ```
- in the DLRM Docker container on NVIDIA DGX-2 with (16x V100 32GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
+ in the DLRM Docker container on NVIDIA DGX-2 with (16x V100 32GB) GPUs. Performance numbers (in records of data per second) were averaged over an entire training epoch.

 | GPUs   | Model size   | Batch size / GPU   | Throughput - FP32    | Throughput - mixed precision    | Throughput speedup (FP32 - mixed precision)     
 |----:|----|---|---:|---:|---:|
-| 16 | large | 64k | 4,567,478 | 11,208,483 | 2.45 |
-| 8 | large | 64k | 3,169,146 | 8,315,534 | 2.62 |
-| 1 | small | 32k | 706,933 | 2,417,585 | 3.42 |
+| 16 | large | 64k | 4,494,685 | 10,360,385 | 2.30 |
+| 8  | large | 64k | 3,202,701 |  8,394,967 | 2.62 |
+| 1  | small | 32k |   692,052 |  2,235,067 | 3.23 |


 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@ -719,8 +855,15 @@ August 2020
 - Automatic placement and load balancing of embedding tables
 - Improved README

+March 2021
+- Added NVTabular as a new preprocessing option
+- Added a new dataset - xlarge, which uses a frequency threshold of 2
+- Added performance results for a new GPU - A100 80GB
+- Updated Spark preprocessing
+- Added Adam as an optional optimizer for embedding and MLPs, for multi-GPU training
+- Improved README

 ### Known issues

-There are no known issues with this model
+- Adam performance is not optimized.  

--- a/PyTorch/Recommendation/DLRM/bind.sh
+++ b/PyTorch/Recommendation/DLRM/bind.sh
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dgxa100_ccx.sh
+++ b/PyTorch/Recommendation/DLRM/dgxa100_ccx.sh
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/dot_based_interact.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/dot_based_interact.py
@ -1,3 +1,17 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 from torch.autograd import Function
 from apex import amp
--- a/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py
@ -1,3 +1,17 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 Fused Buckle Embedding
 """
--- a/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/data/data_loader.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/data/data_loader.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -44,7 +44,7 @@ if __name__ == '__main__':
    parser.add_argument('--steps', type=int, default=1000)
    args = parser.parse_args()

-    dataset = CriteoBinDataset(data_file=args.file, batch_size=args.batch_size)
+    dataset = CriteoBinDataset(data_path=args.file, batch_size=args.batch_size)

    begin = time.time()
    for i in range(args.steps):
--- a/PyTorch/Recommendation/DLRM/dlrm/data/datasets.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/data/datasets.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -35,7 +35,7 @@ class SyntheticDataset(Dataset):
        self,
        num_entries: int,
        device: str = 'cuda',
-        batch_size: int = 1,
+        batch_size: int = 32768,
        numerical_features: Optional[int] = None,
        categorical_feature_sizes: Optional[Sequence[int]] = None,
        device_mapping: Optional[Dict[str, Any]] = None
@ -79,12 +79,12 @@ class CriteoBinDataset(Dataset):

    def __init__(
        self,
-        data_file: str,
+        data_path: str,
        batch_size: int = 1,
-        subset: float = None,
        numerical_features: int = 13,
        categorical_features: int = 26,
-        data_type: str = 'int32'
+        data_type: str = 'int32',
+        **kwargs
    ):
        self.data_type = np.__dict__[data_type]
        bytes_per_feature = self.data_type().nbytes
@ -94,14 +94,9 @@ class CriteoBinDataset(Dataset):

        self.batch_size = batch_size
        self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
-        self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
+        self.num_entries = math.ceil(os.path.getsize(data_path) / self.bytes_per_entry)

-        if subset is not None:
-            if subset <= 0 or subset > 1:
-                raise ValueError('Subset parameter must be in (0,1) range')
-            self.num_entries = math.ceil(self.num_entries * subset)
-
-        self.file = open(data_file, 'rb')
+        self.file = open(data_path, 'rb')
        self._last_read_idx = -1

    def __len__(self):
@ -142,13 +137,16 @@ class SplitCriteoDataset(Dataset):
        data_path: str,
        batch_size: int = 1,
        numerical_features: bool = False,
+        number_of_numerical_features: int = 13,
        categorical_features: Optional[Sequence[int]] = None,
        categorical_feature_sizes: Optional[Sequence[int]] = None,
        prefetch_depth: int = 10,
        drop_last_batch: bool = False,
+        **kwargs
    ):
        self._label_bytes_per_batch = np.dtype(np.bool).itemsize * batch_size
-        self._numerical_bytes_per_batch = 13 * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
+        self._number_of_numerical_features = number_of_numerical_features
+        self._numerical_bytes_per_batch = self._number_of_numerical_features * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
        self._categorical_feature_types = [
            get_categorical_feature_type(size) for size in categorical_feature_sizes
        ] if categorical_feature_sizes else []
@ -226,7 +224,7 @@ class SplitCriteoDataset(Dataset):
        raw_numerical_data = os.pread(self._numerical_features_file, self._numerical_bytes_per_batch,
                                      idx * self._numerical_bytes_per_batch)
        array = np.frombuffer(raw_numerical_data, dtype=np.float16)
-        return torch.from_numpy(array).view(-1, 13)
+        return torch.from_numpy(array).view(-1, self._number_of_numerical_features)

    def _get_categorical_features(self, idx: int) -> Optional[torch.Tensor]:
        if self._categorical_features_files is None:
--- a/PyTorch/Recommendation/DLRM/dlrm/data/factories.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/data/factories.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -21,7 +21,8 @@ from torch.utils.data import Dataset, Sampler, RandomSampler

 from dlrm.data.datasets import CriteoBinDataset, SyntheticDataset, SplitCriteoDataset
 from dlrm.data.samplers import RandomDistributedSampler
-from dlrm.data.utils import collate_array, write_dataset_to_disk, get_categorical_feature_sizes, collate_split_tensors
+from dlrm.data.utils import collate_array, write_dataset_to_disk, get_categorical_feature_sizes, \
+    collate_split_tensors
 from dlrm.utils.distributed import is_distributed, is_main_process, get_rank


@ -40,24 +41,34 @@ def create_synthetic_datasets(flags, device_mapping: Optional[Dict] = None):
    return dataset_train, dataset_test


-def create_real_datasets(flags, path, dataset_class: type = CriteoBinDataset):
-    train_dataset = os.path.join(path, "train_data.bin")
-    test_dataset = os.path.join(path, "test_data.bin")
+def create_real_datasets(
+    flags,
+    path,
+    dataset_class: type = SplitCriteoDataset,
+    train_dataset_path="train",
+    test_dataset_path="test",
+    **kwargs
+):
+    train_dataset = os.path.join(path, train_dataset_path)
+    test_dataset = os.path.join(path, test_dataset_path)
    categorical_sizes = get_categorical_feature_sizes(flags)

    dataset_train = dataset_class(
-        data_file=train_dataset,
+        data_path=train_dataset,
        batch_size=flags.batch_size,
-        subset=flags.dataset_subset,
        numerical_features=flags.num_numerical_features,
-        categorical_features=len(categorical_sizes),
+        categorical_features=range(len(categorical_sizes)),
+        categorical_feature_sizes=categorical_sizes,
+        **kwargs
    )

    dataset_test = dataset_class(
-        data_file=test_dataset,
+        data_path=test_dataset,
        batch_size=flags.test_batch_size,
        numerical_features=flags.num_numerical_features,
-        categorical_features=len(categorical_sizes),
+        categorical_features=range(len(categorical_sizes)),
+        categorical_feature_sizes=categorical_sizes,
+        **kwargs
    )

    return dataset_train, dataset_test
@ -73,7 +84,8 @@ class DatasetFactory:
        if self._device_mapping is not None:
            # selection of categorical features assigned to this device
            device_cat_features = torch.tensor(
-                self._device_mapping["embedding"][get_rank()], device=self._flags.base_device, dtype=torch.long)
+                self._device_mapping["embedding"][get_rank()], device=self._flags.base_device,
+                dtype=torch.long)
        else:
            device_cat_features = None

@ -92,7 +104,12 @@ class DatasetFactory:
    def create_datasets(self) -> Tuple[Dataset, Dataset]:
        raise NotImplementedError()

-    def create_data_loader(self, dataset, collate_fn: Optional[Callable] = None, sampler: Optional[Sampler] = None):
+    def create_data_loader(
+        self,
+        dataset,
+        collate_fn: Optional[Callable] = None,
+        sampler: Optional[Sampler] = None
+    ):
        return torch.utils.data.DataLoader(
            dataset, collate_fn=collate_fn, sampler=sampler, batch_size=None,
            num_workers=0, pin_memory=False
@ -112,7 +129,11 @@ class SyntheticDiskDatasetFactory(DatasetFactory):
        else:
            self._write(synthetic_train, synthetic_test)

-        return create_real_datasets(self._flags, self._flags.synthetic_dataset_dir)
+        return create_real_datasets(
+            self._flags, self._flags.synthetic_dataset_dir,
+            SplitCriteoDataset, "train", "test",
+            prefetch_depth=10
+        )

    def _synchronized_write(self, train_dataset: Dataset, test_dataset: Dataset):
        if is_main_process():
@ -139,12 +160,18 @@ class SyntheticGpuDatasetFactory(DatasetFactory):
 class BinaryDatasetFactory(DatasetFactory):

    def create_datasets(self) -> Tuple[Dataset, Dataset]:
-        return create_real_datasets(self._flags, self._flags.dataset)
-
+        return create_real_datasets(
+            self._flags,
+            self._flags.dataset,
+            dataset_class=CriteoBinDataset,
+            train_dataset_path="train_data.bin",
+            test_dataset_path="test_data.bin"
+        )

 class SplitBinaryDatasetFactory(DatasetFactory):

-    def __init__(self, flags, numerical_features: bool, categorical_features: Sequence[int]):
+    def __init__(self, flags, numerical_features: bool,
+                 categorical_features: Sequence[int]):
        super().__init__(flags)
        self._numerical_features = numerical_features
        self._categorical_features = categorical_features
@ -174,6 +201,7 @@ class SplitBinaryDatasetFactory(DatasetFactory):
            categorical_feature_sizes=categorical_sizes,
            prefetch_depth=prefetch_depth
        )
+
        dataset_test = SplitCriteoDataset(
            data_path=test_dataset_path,
            batch_size=self._flags.test_batch_size,
@ -182,8 +210,8 @@ class SplitBinaryDatasetFactory(DatasetFactory):
            categorical_feature_sizes=categorical_sizes,
            prefetch_depth=prefetch_depth
        )
-        return dataset_train, dataset_test

+        return dataset_train, dataset_test

 def create_dataset_factory(flags, device_mapping: Optional[dict] = None) -> DatasetFactory:
    """
@ -201,7 +229,7 @@ def create_dataset_factory(flags, device_mapping: Optional[dict] = None) -> Data
        return BinaryDatasetFactory(flags, device_mapping)

    if dataset_type == "split":
-        if is_distributed():
+        if is_distributed() or device_mapping:
            assert device_mapping is not None, "Distributed dataset requires information about model device mapping."
            rank = get_rank()
            return SplitBinaryDatasetFactory(
--- a/PyTorch/Recommendation/DLRM/dlrm/data/samplers.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/data/samplers.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/data/utils.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/data/utils.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -14,23 +14,27 @@

 import json
 import os
+from typing import Tuple, Optional

 import numpy as np
 import pandas as pd
 import torch
-import tqdm
 from torch import Tensor
 from torch.cuda import Stream
-from typing import Tuple, Optional
+from torch.utils.data import Dataset, DataLoader
+import tqdm
+
+DATASET_SAVING_BATCH_SIZE = 512


 def collate_split_tensors(
-        tensors: Tuple[Tensor, Tensor, Tensor],
-        device: str,
-        orig_stream: Stream,
-        numerical_type: torch.dtype = torch.float32
+    tensors: Tuple[Tensor, Tensor, Tensor],
+    device: str,
+    orig_stream: Stream,
+    numerical_type: torch.dtype = torch.float32
 ):
-    tensors = [tensor.to(device, non_blocking=True) if tensor is not None else None for tensor in tensors]
+    tensors = [tensor.to(device, non_blocking=True) if tensor is not None else None for tensor in
+               tensors]
    if device == 'cuda':
        for tensor in tensors:
            if tensor is not None:
@ -45,11 +49,11 @@ def collate_split_tensors(


 def collate_array(
-        array: np.array,
-        device: str,
-        orig_stream: Stream,
-        num_numerical_features: int,
-        selected_categorical_features: Optional[Tensor] = None
+    array: np.array,
+    device: str,
+    orig_stream: Stream,
+    num_numerical_features: int,
+    selected_categorical_features: Optional[Tensor] = None
 ):
    # numerical features are encoded as float32
    numerical_features = array[:, 1:1 + num_numerical_features].view(dtype=np.float32)
@ -73,40 +77,69 @@ def collate_array(
    return numerical_features, categorical_features, click


-def write_dataset_to_disk(destination, dataset_train, dataset_test, table_sizes):
-    for filename, dataset in zip(('train_data.bin', 'test_data.bin'),
+def get_categorical_feature_type(size: int):
+    types = (np.int8, np.int16, np.int32)
+
+    for numpy_type in types:
+        if size < np.iinfo(numpy_type).max:
+            return numpy_type
+
+    raise RuntimeError(f"Categorical feature of size {size} is too big for defined types")
+
+
+def write_dataset_to_disk(destination, dataset_train: Dataset, dataset_test, table_sizes):
+    for filename, dataset in zip(('train', 'test'),
                                 (dataset_train, dataset_test)):

-        os.makedirs(destination, exist_ok=True)
        dst_file = os.path.join(destination, filename)
-        if os.path.exists(dst_file):
-            print(f'File {dst_file} already exists, skipping')
-            continue
+        os.makedirs(dst_file, exist_ok=True)

-        with open(dst_file, 'wb') as dst_fd:
-            for numeric, categorical, label in tqdm.tqdm(dataset):
-                # numeric, categorical, label = collate(batch, device='cpu',
-                #                                       orig_stream=None,
-                #                                       num_numerical_features=13)
+        cat_feature_types = [get_categorical_feature_type(int(cat_size)) for cat_size in
+                             table_sizes]

-                categorical = categorical.to(torch.int32)
-                label = label.to(torch.int32)
+        file_streams = []

-                l = pd.DataFrame(label.cpu().numpy())
-                l.columns = ['label']
-                n = pd.DataFrame(numeric.cpu().numpy())
-                n.columns = ['n' + str(i) for i in range(len(n.columns))]
+        try:
+            numerical_f = open(os.path.join(dst_file, "numerical.bin"), "wb+")
+            file_streams.append(numerical_f)

-                c = pd.DataFrame(categorical.cpu().numpy())
-                c.columns = ['c' + str(i) for i in range(len(c.columns))]
-                df = pd.concat([l, n, c], axis=1)
+            label_f = open(os.path.join(dst_file, 'label.bin'), 'wb+')
+            file_streams.append(label_f)

-                records = df.to_records(index=False)
-                raw_data = records.tobytes()
+            categorical_fs = []
+            for i in range(len(table_sizes)):
+                fs = open(os.path.join(dst_file, f'cat_{i}.bin'), 'wb+')
+                categorical_fs.append(fs)
+                file_streams.append(fs)

-                dst_fd.write(raw_data)
+            for numerical, categorical, label in tqdm.tqdm(
+                DataLoader(dataset, DATASET_SAVING_BATCH_SIZE),
+                desc=filename + " dataset saving",
+                unit_scale=DATASET_SAVING_BATCH_SIZE
+            ):
+                number_of_numerical_variables = numerical.shape[-1]
+                number_of_categorical_variables = categorical.shape[-1]
+                numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
+                label_f.write(label.to(torch.bool).cpu().numpy().tobytes())

-    model_size_dict = {'_c' + str(i): size for i, size in zip(range(14, 40), table_sizes)}
+                for cat_idx, cat_feature_type in enumerate(cat_feature_types):
+                    categorical_fs[cat_idx].write(
+                        categorical[:, :, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())
+
+        finally:
+            for stream in file_streams:
+                stream.close()
+
+    model_size_dict = {
+        '_c' + str(i): size
+        for i, size in zip(
+            range(
+                1 + number_of_numerical_variables,
+                1 + number_of_numerical_variables + number_of_categorical_variables
+            ),
+            table_sizes
+        )
+    }
    with open(os.path.join(destination, 'model_size.json'), 'w') as f:
        json.dump(model_size_dict, f, indent=4, sort_keys=True)

@ -131,14 +164,14 @@ def prefetcher(load_iterator, prefetch_stream):


 def get_categorical_feature_sizes(FLAGS):
-    if FLAGS.dataset_type in ['synthetic_disk', 'synthetic_gpu']:
+    if FLAGS.dataset_type in ['synthetic_gpu']:
        feature_sizes = [int(s) for s in FLAGS.synthetic_dataset_table_sizes]
        print('feature sizes: ', feature_sizes)
        return feature_sizes

    categorical_sizes_file = os.path.join(FLAGS.dataset, "model_size.json")
    with open(categorical_sizes_file) as f:
-        categorical_sizes = json.load(f).values()
+        categorical_sizes = [int(v) for v in json.load(f).values()]

    categorical_sizes = list(categorical_sizes)

@ -152,13 +185,3 @@ def get_categorical_feature_sizes(FLAGS):

    clipped_sizes = [min(s, FLAGS.max_table_size) for s in categorical_sizes]
    return clipped_sizes
-
-
-def get_categorical_feature_type(size: int):
-    types = (np.int8, np.int16, np.int32)
-
-    for numpy_type in types:
-        if size < np.iinfo(numpy_type).max:
-            return numpy_type
-
-    raise RuntimeError(f"Categorical feature of size {size} is too big for defined types")
--- a/PyTorch/Recommendation/DLRM/dlrm/model/distributed.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/model/distributed.py
@ -1,3 +1,17 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Sequence, Optional

 import torch
@ -124,10 +138,13 @@ class DistributedDlrm(nn.Module):

        self.bottom_model = DlrmBottom(
            num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes,
-            embedding_type, embedding_dim, hash_indices=hash_indices, use_cpp_mlp=use_cpp_mlp, fp16=fp16, device=device
+            embedding_type, embedding_dim, hash_indices=hash_indices, use_cpp_mlp=use_cpp_mlp,
+            fp16=fp16, device=device
        )
        self.top_model = DlrmTop(top_mlp_sizes, interaction, use_cpp_mlp=use_cpp_mlp).to(device)

+        self.distributed = dist.get_world_size() > 1
+
    def extra_repr(self):
        return f"interaction_op={self._interaction_op}, hash_indices={self._hash_indices}"

@ -146,11 +163,14 @@ class DistributedDlrm(nn.Module):
            batch_sizes_per_gpu (Sequence[int]):
        """
        # bottom mlp output may not be present before all-to-all communication
-        bottom_output, _ = self.bottom_model(numerical_input, categorical_inputs)
+        from_bottom, bottom_mlp_output = self.bottom_model(numerical_input, categorical_inputs)

-        from_bottom = bottom_to_top(bottom_output, batch_sizes_per_gpu, self._embedding_dim, self._vectors_per_gpu,
-                                    self._feature_order, self._device_feature_order)
+        # only perform all_to_all in multiGPU mode
+        if self.distributed:
+            from_bottom = bottom_to_top(from_bottom, batch_sizes_per_gpu, self._embedding_dim, self._vectors_per_gpu,
+                                        self._feature_order, self._device_feature_order)
+
+            # TODO: take bottom_mlp GPU from device mapping, do not assume it's always first
+            bottom_mlp_output = from_bottom[:, 0, :]

-        # TODO: take bottom_mlp GPU from device mapping, do not assume it's always first
-        bottom_mlp_output = from_bottom[:, 0, :]
        return self.top_model(from_bottom, bottom_mlp_output)
--- a/PyTorch/Recommendation/DLRM/dlrm/model/single.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/model/single.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/nn/embeddings.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/nn/embeddings.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -153,6 +153,10 @@ class JointEmbedding(Embeddings):
            data[offsets[cat]:offsets[cat + 1]] = weight


+# If you want to use a fused joint embedding for a different number of variables, first change
+# the custom cuda kernel code to accommodate the new number, then change this value accordingly
+FUSED_JOINT_EMBEDDING_NUMBER_OF_CATEGORICAL_VARIABLES = 26
+
 class FusedJointEmbedding(Embeddings):
    """
    Buckle multiple one hot embedding together
@ -184,6 +188,10 @@ class FusedJointEmbedding(Embeddings):
        self.register_parameter("weight", torch.nn.Parameter(
            torch.empty((self.offsets[-1].item(), embedding_dim), device=device), requires_grad=True))

+        if len(categorical_feature_sizes) != FUSED_JOINT_EMBEDDING_NUMBER_OF_CATEGORICAL_VARIABLES:
+            raise ValueError(  f"Number of categorical features must be equal to {FUSED_JOINT_EMBEDDING_NUMBER_OF_CATEGORICAL_VARIABLES}, got {len(categorical_feature_sizes)}\n"
+                             + f"If you want to train on a different number, you need to recompile cuda kernels to support it or use different embedding type.")
+
    def forward(self, categorical_inputs) -> List[torch.Tensor]:
        # Check input has the right shape
        if self.hash_indices:
--- a/PyTorch/Recommendation/DLRM/dlrm/nn/factories.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/nn/factories.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/nn/interactions.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/nn/interactions.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/nn/mlps.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/nn/mlps.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/nn/parts.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/nn/parts.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -85,7 +85,6 @@ class DlrmBottom(nn.Module):
        Returns:
            Tensor: Concatenated bottom mlp and embedding output in shape [batch, 1 + #embedding, embedding_dim]
        """
-        batch_size = len(numerical_input) if numerical_input is not None else len(categorical_inputs)
        bottom_output = []
        bottom_mlp_output = None

@ -95,7 +94,7 @@ class DlrmBottom(nn.Module):
                bottom_mlp_output = bottom_mlp_output.half()

            # reshape bottom mlp to concatenate with embeddings
-            bottom_output.append(bottom_mlp_output.view(batch_size, 1, -1))
+            bottom_output.append(bottom_mlp_output.view(-1, 1, self._embedding_dim))

        if self.num_categorical_features > 0:
            bottom_output += self.embeddings(categorical_inputs)
--- a/PyTorch/Recommendation/DLRM/dlrm/scripts/dist_main.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/scripts/dist_main.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -32,11 +32,11 @@ from dlrm.model.distributed import DistributedDlrm
 from dlrm.scripts.main import FLAGS, get_categorical_feature_sizes
 from dlrm.utils import distributed as dist
 from dlrm.utils.checkpointing.distributed import make_distributed_checkpoint_writer, make_distributed_checkpoint_loader
-from dlrm.utils.distributed import get_gpu_batch_sizes, get_device_mapping, is_main_process, is_distributed
+from dlrm.utils.distributed import get_gpu_batch_sizes, get_device_mapping, is_main_process

 # Training schedule flags
 FLAGS.set_default("batch_size", 65536)
-FLAGS.set_default("test_batch_size", 131072)
+FLAGS.set_default("test_batch_size", 65536)
 FLAGS.set_default("lr", 24.0)
 FLAGS.set_default("warmup_factor", 0)
 FLAGS.set_default("warmup_steps", 8000)
@ -47,8 +47,11 @@ FLAGS.set_default("decay_end_lr", 0)
 FLAGS.set_default("embedding_type", "joint_sparse")

 flags.DEFINE_string("backend", "nccl", "Backend to use for distributed training. Default nccl")
-flags.DEFINE_boolean("bottom_features_ordered", False, "Sort features from the bottom model, useful when using saved "
-                                                       "checkpoint in different device configurations")
+flags.DEFINE_boolean("bottom_features_ordered", False,
+                     "Sort features from the bottom model, useful when using saved "
+                     "checkpoint in different device configurations")
+flags.DEFINE_boolean("Adam_embedding_optimizer", False, "Swaps embedding optimizer to Adam")
+flags.DEFINE_boolean("Adam_MLP_optimizer", False, "Swaps MLP optimizer to Adam")


 def main(argv):
@ -60,9 +63,6 @@ def main(argv):
    rank, world_size, gpu = dist.init_distributed_mode(backend=FLAGS.backend, use_gpu=use_gpu)
    device = FLAGS.base_device

-    if not is_distributed():
-        raise NotImplementedError("This file is only for distributed training.")
-
    if is_main_process():
        dllogger.log(data=FLAGS.flag_values_dict(), step='PARAMETER')

@ -113,15 +113,46 @@ def main(argv):
    # DDP introduces a gradient average through allreduce(mean), which doesn't apply to bottom model.
    # Compensate it with further scaling lr
    scaled_lr = FLAGS.lr / FLAGS.loss_scale if FLAGS.amp else FLAGS.lr
-    scaled_lrs = [scaled_lr / world_size, scaled_lr]

-    embedding_optimizer = torch.optim.SGD([
-        {'params': model.bottom_model.embeddings.parameters(), 'lr': scaled_lrs[0]},
-    ])
-    mlp_optimizer = apex_optim.FusedSGD([
-        {'params': model.bottom_model.mlp.parameters(), 'lr': scaled_lrs[0]},
-        {'params': model.top_model.parameters(), 'lr': scaled_lrs[1]}
-    ])
+    if FLAGS.Adam_embedding_optimizer:
+        embedding_model_parallel_lr = scaled_lr
+    else:
+        embedding_model_parallel_lr = scaled_lr / world_size
+    if FLAGS.Adam_MLP_optimizer:
+        MLP_model_parallel_lr = scaled_lr
+    else:
+        MLP_model_parallel_lr = scaled_lr / world_size
+    data_parallel_lr = scaled_lr
+
+
+    if is_main_process():
+        mlp_params = [
+            {'params': list(model.top_model.parameters()), 'lr': data_parallel_lr},
+            {'params': list(model.bottom_model.mlp.parameters()), 'lr': MLP_model_parallel_lr}
+        ]
+        mlp_lrs = [data_parallel_lr, MLP_model_parallel_lr]
+    else:
+        mlp_params = [
+            {'params': list(model.top_model.parameters()), 'lr': data_parallel_lr}
+        ]
+        mlp_lrs = [data_parallel_lr]
+
+    if FLAGS.Adam_MLP_optimizer:
+        mlp_optimizer = apex_optim.FusedAdam(mlp_params)
+    else:
+        mlp_optimizer = apex_optim.FusedSGD(mlp_params)
+
+    embedding_params = [{
+        'params': list(model.bottom_model.embeddings.parameters()),
+        'lr': embedding_model_parallel_lr
+    }]
+    embedding_lrs = [embedding_model_parallel_lr]
+    
+    if FLAGS.Adam_embedding_optimizer:
+        embedding_optimizer = torch.optim.SparseAdam(embedding_params)
+    else:
+        embedding_optimizer = torch.optim.SGD(embedding_params)
+

    checkpoint_writer = make_distributed_checkpoint_writer(
        device_mapping=device_mapping,
@ -179,7 +210,7 @@ def main(argv):
    moving_loss_stream = torch.cuda.Stream()

    lr_scheduler = utils.LearningRateScheduler(optimizers=[mlp_optimizer, embedding_optimizer],
-                                               base_lrs=[scaled_lrs, [scaled_lrs[0]]],
+                                               base_lrs=[mlp_lrs, embedding_lrs],
                                               warmup_steps=FLAGS.warmup_steps,
                                               warmup_factor=FLAGS.warmup_factor,
                                               decay_start_step=FLAGS.decay_start_step,
@ -221,10 +252,13 @@ def main(argv):

                loss = loss_fn(output, click[batch_indices[rank]: batch_indices[rank + 1]])

-                # We don't need to accumulate gradient. Set grad to None is faster than optimizer.zero_grad()
-                for param_group in itertools.chain(embedding_optimizer.param_groups, mlp_optimizer.param_groups):
-                    for param in param_group['params']:
-                        param.grad = None
+                if FLAGS.Adam_embedding_optimizer or FLAGS.Adam_MLP_optimizer:
+                    model.zero_grad()
+                else:
+                    # We don't need to accumulate gradient. Set grad to None is faster than optimizer.zero_grad()
+                    for param_group in itertools.chain(embedding_optimizer.param_groups, mlp_optimizer.param_groups):
+                        for param in param_group['params']:
+                            param.grad = None

                if FLAGS.amp:
                    loss *= FLAGS.loss_scale
@ -233,7 +267,12 @@ def main(argv):
                else:
                    loss.backward()

+                if FLAGS.Adam_MLP_optimizer:
+                    scale_MLP_gradients(mlp_optimizer, world_size)
                mlp_optimizer.step()
+
+                if FLAGS.Adam_embedding_optimizer:
+                    scale_embeddings_gradients(embedding_optimizer, world_size)
                embedding_optimizer.step()

                moving_loss_stream.wait_stream(torch.cuda.current_stream())
@ -248,23 +287,22 @@ def main(argv):
                print(f"Started epoch {epoch}...")
            elif step % print_freq == 0:
                torch.cuda.current_stream().wait_stream(moving_loss_stream)
-                # Averaging cross a print_freq period to reduce the error.
+                # Averaging across a print_freq period to reduce the error.
                # An accurate timing needs synchronize which would slow things down.

                if global_step < FLAGS.benchmark_warmup_steps:
                    metric_logger.update(
                        loss=moving_loss.item() / print_freq / (FLAGS.loss_scale if FLAGS.amp else 1),
-                        lr=mlp_optimizer.param_groups[1]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1))
+                        lr=mlp_optimizer.param_groups[0]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1))
                else:
                    metric_logger.update(
                        step_time=timer.measured,
                        loss=moving_loss.item() / print_freq / (FLAGS.loss_scale if FLAGS.amp else 1),
-                        lr=mlp_optimizer.param_groups[1]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1))
+                        lr=mlp_optimizer.param_groups[0]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1))
                stop_time = time()

                eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
-                metric_logger.print(
-                    header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")
+                metric_logger.print(header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")

                with torch.cuda.stream(moving_loss_stream):
                    moving_loss = 0.
@ -285,7 +323,7 @@ def main(argv):
                if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                    run_time_s = int(stop_time - start_time)
                    print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                          f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                          f"{global_step / steps_per_epoch:.2f} in {run_time_s}s. "
                          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                    sys.exit()

@ -305,6 +343,16 @@ def main(argv):

    dllogger.log(data=results, step=tuple())

+def scale_MLP_gradients(mlp_optimizer: torch.optim.Optimizer, world_size: int):
+    """Divide the gradients of every MLP parameter group except the first by ``world_size``.
+
+    The division happens in place on ``param.grad``. The first parameter group
+    is left untouched; parameters that currently hold no gradient are skipped.
+
+    Args:
+        mlp_optimizer: optimizer whose ``param_groups[1:]`` are rescaled.
+        world_size: divisor applied to each gradient.
+    """
+    for param_group in mlp_optimizer.param_groups[1:]:  # Omitting top MLP
+        for param in param_group['params']:
+            # A parameter that took no part in the backward pass has grad None;
+            # there is nothing to scale, and calling div_ on it would raise.
+            if param.grad is not None:
+                param.grad.div_(world_size)
+
+def scale_embeddings_gradients(embedding_optimizer: torch.optim.Optimizer, world_size: int):
+    """Divide the gradient of every embedding parameter by ``world_size`` in place.
+
+    All parameter groups of ``embedding_optimizer`` are visited; parameters
+    that currently hold no gradient are skipped.
+
+    Args:
+        embedding_optimizer: optimizer whose parameters' gradients are rescaled.
+        world_size: divisor applied to each gradient.
+    """
+    for param_group in embedding_optimizer.param_groups:
+        for param in param_group['params']:
+            # Identity test: `!= None` would go through the tensor's comparison
+            # operator instead of simply checking whether a gradient exists.
+            if param.grad is not None:
+                param.grad.div_(world_size)

 def dist_evaluate(model, data_loader):
    """Test distributed DLRM model
@ -375,6 +423,10 @@ def dist_evaluate(model, data_loader):
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

+            if FLAGS.auc_device == "CPU":
+                click = click.cpu()
+                output_receive_buffer = output_receive_buffer.cpu()
+
            y_true.append(click)
            y_score.append(output_receive_buffer)

--- a/PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -19,6 +19,8 @@ import numpy as np
 import torch
 from absl import app, flags
 from apex import amp
+import pyprof
+import torch.cuda.profiler as profiler

 import dlrm.scripts.utils as utils
 from dlrm.data.data_loader import get_data_loaders
@ -30,18 +32,18 @@ from dlrm.utils.checkpointing.serial import SerialCheckpointWriter, make_serial_
 FLAGS = flags.FLAGS

 # Basic run settings
-flags.DEFINE_enum("mode", default='train', enum_values=['train', 'test', 'inference_benchmark'],
+flags.DEFINE_enum("mode", default='train', enum_values=['train', 'test', 'inference_benchmark', 'prof-train'],
                  help="Select task to be performed")
-
 flags.DEFINE_integer("seed", 12345, "Random seed")

-# Training schedule flags
+# Training flags
 flags.DEFINE_integer("batch_size", 32768, "Batch size used for training")
 flags.DEFINE_integer("test_batch_size", 32768, "Batch size used for testing/validation")
 flags.DEFINE_float("lr", 28, "Base learning rate")
 flags.DEFINE_integer("epochs", 1, "Number of epochs to train for")
 flags.DEFINE_integer("max_steps", None, "Stop training after doing this many optimization steps")

+# Learning rate schedule flags
 flags.DEFINE_integer("warmup_factor", 0, "Learning rate warmup factor. Must be a non-negative integer")
 flags.DEFINE_integer("warmup_steps", 6400, "Number of warmup optimization steps")
 flags.DEFINE_integer("decay_steps", 80000, "Polynomial learning rate decay steps. If equal to 0 will not do any decaying")
@ -56,35 +58,26 @@ flags.DEFINE_enum("embedding_type", "joint_fused", ["joint", "joint_fused", "joi
 flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of embedding space for categorical features")
 flags.DEFINE_list("top_mlp_sizes", [1024, 1024, 512, 256, 1], "Linear layer sizes for the top MLP")
 flags.DEFINE_list("bottom_mlp_sizes", [512, 256, 128], "Linear layer sizes for the bottom MLP")
-
 flags.DEFINE_enum("interaction_op", default="cuda_dot", enum_values=["cuda_dot", "dot", "cat"],
                  help="Type of interaction operation to perform.")

-flags.DEFINE_string(
-    "dataset", None,
-    "Full path to binary dataset. Must include files such as: train_data.bin, test_data.bin")
-flags.DEFINE_enum("dataset_type", default="split", enum_values=['binary', 'split', 'synthetic_gpu', 'synthetic_disk'],
+# Data configuration
+flags.DEFINE_string("dataset", None, "Path to dataset")
+flags.DEFINE_enum("dataset_type", default="split", enum_values=['binary', 'split', 'synthetic_gpu'],
                  help='The type of the dataset to use')
-
-flags.DEFINE_string("synthetic_dataset_dir", "/tmp/dlrm_sythetic_dataset", "Default synthetic disk dataset directory")
-flags.DEFINE_list("synthetic_dataset_table_sizes", default=','.join(26 * [str(10**5)]),
-                  help="Embedding table sizes to use with the synthetic dataset")
-
-flags.DEFINE_integer("synthetic_dataset_num_entries", default=int(2**15 * 1024), # 1024 batches by default
-                     help="Number of samples per epoch for the synthetic dataset")
-
 flags.DEFINE_boolean("shuffle_batch_order", False, "Read batch in train dataset by random order", short_name="shuffle")
-
 flags.DEFINE_integer("num_numerical_features", 13,
                     "Number of numerical features in the dataset. Defaults to 13 for the Criteo Terabyte Dataset")
-
 flags.DEFINE_integer("max_table_size", None,
                     "Maximum number of rows per embedding table, by default equal to the number of unique values for each categorical variable")
 flags.DEFINE_boolean("hash_indices", False,
                     "If True the model will compute `index := index % table size` to ensure that the indices match table sizes")

-flags.DEFINE_float("dataset_subset", None,
-     "Use only a subset of the training data. If None (default) will use all of it. Must be either None, or a float in range [0,1]")
+# Synthetic data configuration
+flags.DEFINE_list("synthetic_dataset_table_sizes", default=','.join(26 * [str(10**5)]),
+                  help="Embedding table sizes to use with the synthetic dataset")
+flags.DEFINE_integer("synthetic_dataset_num_entries", default=int(2**15 * 1024), # 1024 batches by default
+                     help="Number of samples per epoch for the synthetic dataset")

 # Checkpointing
 flags.DEFINE_string("load_checkpoint_path", None, "Path from which to load a checkpoint")
@ -96,7 +89,6 @@ flags.DEFINE_string("log_path", "./log.json", "Destination for the log file with
 flags.DEFINE_integer("test_freq", None, "Number of optimization steps between validations. If None will test after each epoch")
 flags.DEFINE_float("test_after", 0, "Don't test the model unless this many epochs has been completed")
 flags.DEFINE_integer("print_freq", 200, "Number of optimizations steps between printing training status to stdout")
-
 flags.DEFINE_integer("benchmark_warmup_steps", 0, "Number of initial iterations to exclude from throughput measurements")

 # Machine setting flags
@ -110,9 +102,11 @@ flags.DEFINE_list("inference_benchmark_batch_sizes", default=[1, 64, 4096],
 flags.DEFINE_integer("inference_benchmark_steps", 200,
                     "Number of steps for measuring inference latency and throughput")

+# Miscellaneous
 flags.DEFINE_float("auc_threshold", None, "Stop the training after achieving this AUC")
 flags.DEFINE_boolean("optimized_mlp", True, "Use an optimized implementation of MLP from apex")
-
+flags.DEFINE_enum("auc_device", default="GPU", enum_values=['GPU', 'CPU'],
+                  help="Specifies where ROC AUC metric is calculated")

 def validate_flags():
    if FLAGS.max_table_size is not None and not FLAGS.hash_indices:
@ -131,6 +125,12 @@ def validate_flags():
            print('WARNING: Optimized MLP is not supported on CPU')
            FLAGS.optimized_mlp = False

+    if FLAGS.embedding_type == 'joint_fused' and FLAGS.embedding_dim != 128:
+        print('WARNING: Joint fused can be used only with embedding_dim=128. Changed embedding type to joint.')
+        FLAGS.embedding_type = 'joint'
+
+    if FLAGS.dataset == None and FLAGS.dataset_type != 'synthetic_gpu':
+        raise ValueError('Dataset argument has to specify a path to the dataset')

 def is_data_prefetching_enabled() -> bool:
    return FLAGS.base_device == 'cuda'
@ -187,7 +187,10 @@ def main(argv):

    optimizer = torch.optim.SGD(model.parameters(), lr=scaled_lr)

-    if FLAGS.amp and FLAGS.mode == 'train':
+    if FLAGS.mode == 'prof-train':
+        pyprof.init(enable_function_stack=True)
+
+    if FLAGS.amp and (FLAGS.mode == 'train' or FLAGS.mode == 'prof-train'):
        (model.top_model, model.bottom_model.mlp), optimizer = amp.initialize([model.top_model, model.bottom_model.mlp],
                                                                              optimizer, opt_level="O2", loss_scale=1)
    elif FLAGS.amp:
@ -237,6 +240,9 @@ def main(argv):

    if FLAGS.mode == 'train':
        train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled_lr)
+    if FLAGS.mode == 'prof-train':
+        with torch.autograd.profiler.emit_nvtx():
+            train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled_lr)


 def maybe_save_checkpoint(checkpoint_writer: SerialCheckpointWriter, model, path):
@ -304,7 +310,12 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                          base_lr=scaled_lr, warmup_factor=FLAGS.warmup_factor,
                          decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)

+            if FLAGS.mode == 'prof-train' and global_step == FLAGS.benchmark_warmup_steps:
+                profiler.start()
+
            if FLAGS.max_steps and global_step > FLAGS.max_steps:
+                if FLAGS.mode == 'prof-train':
+                    profiler.stop()
                print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
                break

@ -374,8 +385,7 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
    stop_time = time()
    run_time_s = int(stop_time - start_time)

-    print(f"Finished training in {run_time_s}s. "
-          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+    print(f"Finished training in {run_time_s}s.")

    avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg

@ -432,6 +442,11 @@ def evaluate(model, loss_fn, data_loader):
            output = model(numerical_features, categorical_features).squeeze()

            loss = loss_fn(output, click)
+
+            if FLAGS.auc_device == "CPU":
+                click = click.cpu()
+                output = output.cpu()
+
            y_true.append(click)
            y_score.append(output)

--- a/PyTorch/Recommendation/DLRM/dlrm/scripts/prepare_synthetic_dataset.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/scripts/prepare_synthetic_dataset.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -11,15 +11,43 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dlrm.data.datasets import SyntheticDataset
+from dlrm.data.factories import create_synthetic_datasets
+from dlrm.data.utils import write_dataset_to_disk, get_categorical_feature_sizes
+from absl import app, flags

-from dlrm.data.factories import SyntheticDiskDatasetFactory
-from dlrm.scripts.main import FLAGS
-from absl import app
+FLAGS = flags.FLAGS
+
+flags.DEFINE_integer("num_numerical_features", 13,
+                     "Number of numerical features in the dataset. Defaults to 13 for the Criteo Terabyte Dataset")
+flags.DEFINE_integer("synthetic_dataset_num_entries",
+                     default=int(32768 * 1024),  # 1024 batches for single-GPU training by default
+                     help="Number of samples per epoch for the synthetic dataset")
+flags.DEFINE_list("synthetic_dataset_table_sizes", default=','.join(26 * [str(10 ** 5)]),
+                  help="Embedding table sizes to use with the synthetic dataset")
+flags.DEFINE_string("synthetic_dataset_dir", default="/tmp/dlrm_synthetic_data",
+                    help="Destination of the saved synthetic dataset")


 def main(argv):
-    dataset_factory = SyntheticDiskDatasetFactory(FLAGS)
-    dataset_factory.create_datasets()
+    table_sizes = [int(s) for s in FLAGS.synthetic_dataset_table_sizes]
+    train_dataset = SyntheticDataset(
+        num_entries=FLAGS.synthetic_dataset_num_entries,
+        numerical_features=FLAGS.num_numerical_features,
+        categorical_feature_sizes=table_sizes
+    )
+    test_dataset = SyntheticDataset(
+        num_entries=FLAGS.synthetic_dataset_num_entries,
+        numerical_features=FLAGS.num_numerical_features,
+        categorical_feature_sizes=table_sizes
+    )
+
+    write_dataset_to_disk(
+        FLAGS.synthetic_dataset_dir,
+        train_dataset,
+        test_dataset,
+        FLAGS.synthetic_dataset_table_sizes
+    )


 if __name__ == '__main__':
--- a/PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -59,7 +59,7 @@ class SmoothedValue(object):
    @property
    def median(self):
        d = torch.tensor(list(self.deque))
-        return d.median().item()
+        return d.median().item() if len(self.deque) else 0

    @property
    def avg(self):
@ -68,15 +68,15 @@ class SmoothedValue(object):

    @property
    def global_avg(self):
-        return self.total / self.count
+        return self.total / self.count if self.count else 0

    @property
    def max(self):
-        return max(self.deque)
+        return max(self.deque) if len(self.deque) else 0

    @property
    def value(self):
-        return self.deque[-1]
+        return self.deque[-1] if len(self.deque) else None

    def __str__(self):
        return self.fmt.format(
--- a/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing.py
@ -1,139 +0,0 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import torch
-from os.path import join
-from typing import Dict, Any, Optional, Sequence
-
-
-class DlrmCheckpointNavigator:
-
-    @property
-    def bottom_mlp_path(self) -> str:
-        return "bottom_model.mlp.pt"
-
-    @property
-    def top_model_path(self) -> str:
-        return "top_model.pt"
-
-    @property
-    def metadata_path(self) -> str:
-        return "metadata.pt"
-
-    def embedding_path(self, embedding_index: int) -> str:
-        return f"bottom_model.embeddings.{embedding_index}.pt"
-
-
-class DistributedCheckpointWriter:
-
-    def __init__(
-        self,
-        device_mapping: Dict[str, Any],
-        config: Dict[str, Any],
-        rank: int,
-        main_process: bool
-    ):
-        self._device_mapping = device_mapping
-        self._config = config
-        self._main_process = main_process
-        self._has_bottom_mlp = rank == device_mapping["bottom_mlp"]
-        self._embedding_indices = device_mapping["embedding"][rank]
-        self._navigator = DlrmCheckpointNavigator()
-
-    def save_checkpoint(
-        self,
-        model,
-        checkpoint_path: str,
-        epoch: Optional[int] = None,
-        step: Optional[int] = None
-    ):
-        os.makedirs(checkpoint_path, exist_ok=True)
-
-        self._save_embeddings_weights(checkpoint_path, model)
-
-        if self._has_bottom_mlp:
-            torch.save(model.bottom_model.mlp.state_dict(), join(checkpoint_path, self._navigator.bottom_mlp_path))
-
-        if self._main_process:
-            torch.save(model.top_model.state_dict(), join(checkpoint_path, self._navigator.top_model_path))
-            self._save_metadata(checkpoint_path, epoch, step)
-
-        torch.distributed.barrier()
-
-    def _save_embeddings_weights(self, checkpoint_path: str, model):
-        for embedding_index, weight in zip(self._embedding_indices, model.bottom_model.embeddings.weights):
-            torch.save({"weight": weight}, join(checkpoint_path, self._navigator.embedding_path(embedding_index)))
-
-    def _save_metadata(self, checkpoint_path, epoch, step):
-        torch.save({
-            "config": self._config,
-            "device_mapping": self._device_mapping,
-            "epoch": epoch,
-            "step": step
-        }, join(checkpoint_path, self._navigator.metadata_path))
-
-
-class DistributedCheckpointLoader:
-
-    def __init__(self, device_mapping: Dict[str, Any], rank: int):
-        self._device_mapping = device_mapping
-        self._has_bottom_mlp = rank == device_mapping["bottom_mlp"]
-        self._embedding_indices = device_mapping["embedding"][rank]
-        self._navigator = DlrmCheckpointNavigator()
-
-    def load_checkpoint(self, model, checkpoint_path: str):
-        top_model_state = self._load(checkpoint_path, self._navigator.top_model_path)
-        model.top_model.load_state_dict(top_model_state)
-
-        if self._has_bottom_mlp:
-            bottom_mlp_state = self._load(checkpoint_path, self._navigator.bottom_mlp_path)
-            model.bottom_model.mlp.load_state_dict(bottom_mlp_state)
-
-        embedding_weights = (self._load(checkpoint_path, self._navigator.embedding_path(index))["weight"]
-                             for index in self._embedding_indices)
-        model.bottom_model.embeddings.load_weights(embedding_weights)
-
-        torch.distributed.barrier()
-
-    def _load(self, checkpoint_path: str, state_path: str):
-        return torch.load(join(checkpoint_path, state_path), map_location="cpu")  # loading to CUDA causes OOM errors
-
-
-class CpuCheckpointLoader:
-
-    def __init__(self, embedding_indices: Sequence[int]):
-        self._embedding_indices = embedding_indices
-        self._navigator = DlrmCheckpointNavigator()
-
-    def load_checkpoint(self, model, checkpoint_path: str):
-        top_model_state = self._load(checkpoint_path, self._navigator.top_model_path)
-        model.top_model.load_state_dict(top_model_state)
-
-        bottom_mlp_state = self._load(checkpoint_path, self._navigator.bottom_mlp_path)
-        model.bottom_model.mlp.load_state_dict(bottom_mlp_state)
-
-        embedding_weights = (self._load(checkpoint_path, self._navigator.embedding_path(index))["weight"]
-                             for index in self._embedding_indices)
-        model.bottom_model.embeddings.load_weights(embedding_weights)
-
-    def _load(self, checkpoint_path: str, state_path: str):
-        data = torch.load(join(checkpoint_path, state_path), map_location="cpu")
-        return {self._strip_key(key): value for key, value in data.items()}
-
-    def _strip_key(self, key: str):
-        prefix = "module."
-        if key.startswith(prefix):
-            return key[len(prefix):]
-        return key
--- a/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/distributed.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/distributed.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/model.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/model.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/serial.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/utils/checkpointing/serial.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/dlrm/utils/distributed.py
+++ b/PyTorch/Recommendation/DLRM/dlrm/utils/distributed.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/img/amp_impact_FL15.svg
+++ b/PyTorch/Recommendation/DLRM/img/amp_impact_FL15.svg
--- a/PyTorch/Recommendation/DLRM/img/amp_impact_FL2.svg
+++ b/PyTorch/Recommendation/DLRM/img/amp_impact_FL2.svg
--- a/PyTorch/Recommendation/DLRM/img/amp_impact_FL3.svg
+++ b/PyTorch/Recommendation/DLRM/img/amp_impact_FL3.svg
--- a/PyTorch/Recommendation/DLRM/img/learning_curve_FL15.svg
+++ b/PyTorch/Recommendation/DLRM/img/learning_curve_FL15.svg
--- a/PyTorch/Recommendation/DLRM/img/learning_curve_FL2.svg
+++ b/PyTorch/Recommendation/DLRM/img/learning_curve_FL2.svg
--- a/PyTorch/Recommendation/DLRM/img/learning_curve_FL3.svg
+++ b/PyTorch/Recommendation/DLRM/img/learning_curve_FL3.svg
--- a/PyTorch/Recommendation/DLRM/img/training_stability_FL15.svg
+++ b/PyTorch/Recommendation/DLRM/img/training_stability_FL15.svg
--- a/PyTorch/Recommendation/DLRM/img/training_stability_FL2.svg
+++ b/PyTorch/Recommendation/DLRM/img/training_stability_FL2.svg
--- a/PyTorch/Recommendation/DLRM/img/training_stability_FL3.svg
+++ b/PyTorch/Recommendation/DLRM/img/training_stability_FL3.svg
--- a/PyTorch/Recommendation/DLRM/preproc/DGX-2_config.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/DGX-2_config.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/preproc/DGX-A100_config.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/DGX-A100_config.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Environment variables used to run the Spark job.
+# Adjust the values below to match the resources of your machine.
+
+# Total number of CPU cores that Spark may use.
+export TOTAL_CORES=256
+
+# Number of Spark executors.
+export NUM_EXECUTORS=8
+
+# Cores per executor, derived from the two values above (integer division).
+export NUM_EXECUTOR_CORES=$((${TOTAL_CORES}/${NUM_EXECUTORS}))
+
+# Maximum total memory to use, in GB.
+export TOTAL_MEMORY=2000
+
+# Memory reserved for the Spark driver, in GB.
+export DRIVER_MEMORY=32
+
+# Memory per executor, in GB: the memory left after the driver's share, split
+# evenly across executors, minus 16.
+# NOTE(review): the 16 GB subtraction looks like per-executor headroom - confirm.
+export EXECUTOR_MEMORY=$(((${TOTAL_MEMORY}-${DRIVER_MEMORY})/${NUM_EXECUTORS}-16))
--- a/PyTorch/Recommendation/DLRM/preproc/NVT_shuffle_spark.py
+++ b/PyTorch/Recommendation/DLRM/preproc/NVT_shuffle_spark.py
@ -0,0 +1,57 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from argparse import ArgumentParser
+
+from pyspark.sql import Row, SparkSession, Window
+from pyspark.sql.functions import *
+from pyspark.sql.types import *
+
+
+LABEL_COL = 0
+INT_COLS = list(range(1, 14))
+CAT_COLS = list(range(14, 40))
+
+
+def col_of_rand_long():
+    """Build a column expression: ``rand()`` scaled by 2**52 and cast to ``LongType``."""
+    scaled = rand() * (1 << 52)
+    return scaled.cast(LongType())
+
+def rand_ordinal(df):
+    """Return ``df`` with an added 'ordinal' column taken from ``col_of_rand_long()``."""
+    ordinal_column = col_of_rand_long()
+    return df.withColumn('ordinal', ordinal_column)
+
+def _parse_args():
+    """Parse the command line of the shuffle job.
+
+    Returns:
+        argparse.Namespace with ``input_path`` and ``output_path``.
+
+    Both options are mandatory: the output path is always passed to the
+    parquet writer, so a missing value is rejected here rather than failing
+    after the Spark session has been started.
+    """
+    parser = ArgumentParser()
+    parser.add_argument('--input_path', required=True)
+    parser.add_argument('--output_path', required=True)
+    args = parser.parse_args()
+    return args
+
+
+def _main():
+    """Read the input dataset, reorder its rows by a random key, and write it as parquet.
+
+    Each row gets a temporary 'ordinal' column from ``rand_ordinal``; the data
+    is repartitioned on that column and sorted by it within each partition,
+    then the column is dropped. Existing output is overwritten.
+    """
+    cli_args = _parse_args()
+    session = SparkSession.builder.getOrCreate()
+
+    source_glob = cli_args.input_path + "/*"
+    tagged = rand_ordinal(session.read.load(source_glob))
+    reordered = tagged.repartition('ordinal').sortWithinPartitions('ordinal')
+    result = reordered.drop('ordinal')
+
+    result.write.parquet(cli_args.output_path, mode='overwrite')
+    
+
+if __name__ == '__main__':
+    _main()
--- a/PyTorch/Recommendation/DLRM/preproc/gpu/spark-defaults.conf
+++ b/PyTorch/Recommendation/DLRM/preproc/gpu/spark-defaults.conf
@ -26,5 +26,4 @@
 # spark.driver.memory              5g
 # spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

-spark.worker.resource.gpu.amount        	16
-spark.worker.resource.gpu.discoveryScript   /opt/spark-3.0.0-bin-hadoop3.2/conf/get_gpu_resources.sh
+spark.worker.resource.gpu.discoveryScript   /opt/spark/conf/get_gpu_resources.sh
--- a/PyTorch/Recommendation/DLRM/preproc/parquet_to_binary.py
+++ b/PyTorch/Recommendation/DLRM/preproc/parquet_to_binary.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -23,14 +23,16 @@ import tqdm
 import subprocess

 def process_file(f, dst):
-
+    label = '_c0'
+    dense_columns = [f'_c{i}' for i in range(1, 14)]
+    categorical_columns = [f'_c{i}' for i in range(14, 40)]
    all_columns_sorted = [f'_c{i}' for i in range(0, 40)]
-
    data = pd.read_parquet(f)
    data = data[all_columns_sorted]

-    dense_columns = [f'_c{i}' for i in range(1, 14)]
+    data[label] = data[label].astype(np.int32)
    data[dense_columns] = data[dense_columns].astype(np.float32)
+    data[categorical_columns] = data[categorical_columns].astype(np.int32)

    data = data.to_records(index=False)
    data = data.tobytes()
@ -50,7 +52,7 @@ def main():

    print('Processing train files...')
    train_src_files = glob.glob(args.src_dir + '/train/*.parquet')
-    train_intermediate_dir = args.intermediate_dir + '/train'
+    train_intermediate_dir = os.path.join(args.intermediate_dir, 'train')
    os.makedirs(train_intermediate_dir, exist_ok=True)

    Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, train_intermediate_dir) for f in tqdm.tqdm(train_src_files))
@ -59,7 +61,7 @@ def main():

    print('Processing test files...')
    test_src_files = glob.glob(args.src_dir + '/test/*.parquet')
-    test_intermediate_dir = args.intermediate_dir + '/test'
+    test_intermediate_dir = os.path.join(args.intermediate_dir, 'test')
    os.makedirs(test_intermediate_dir, exist_ok=True)

    Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, test_intermediate_dir) for f in tqdm.tqdm(test_src_files))
@ -67,7 +69,7 @@ def main():

    print('Processing validation files...')
    valid_src_files = glob.glob(args.src_dir + '/validation/*.parquet')
-    valid_intermediate_dir = args.intermediate_dir + '/valid'
+    valid_intermediate_dir = os.path.join(args.intermediate_dir, 'validation')
    os.makedirs(valid_intermediate_dir, exist_ok=True)

    Parallel(n_jobs=args.parallel_jobs)(delayed(process_file)(f, valid_intermediate_dir) for f in tqdm.tqdm(valid_src_files))
@ -82,7 +84,7 @@ def main():
    os.system(f'cat {test_intermediate_dir}/*.bin > {args.dst_dir}/test_data.bin')

    print('Concatenating validation files')
-    os.system(f'cat {valid_intermediate_dir}/*.bin > {args.dst_dir}/val_data.bin')
+    os.system(f'cat {valid_intermediate_dir}/*.bin > {args.dst_dir}/validation_data.bin')
    print('Done')


--- a/PyTorch/Recommendation/DLRM/preproc/prepare_dataset.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/prepare_dataset.sh
@ -1,6 +1,6 @@
 #! /bin/bash

-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -15,59 +15,73 @@
 # limitations under the License.

 # Examples:
-# to run on a DGX2 with a frequency limit of 3 (will need 8xV100-32GB to fit the model in GPU memory)
-# ./prepare_dataset.sh DGX2 3
+# to run on GPU with a frequency limit of 3 using NVTabular:
+#   ./prepare_dataset.sh 3 GPU NVTabular
 #
-# to run on a DGX2 with a frequency limit of 15 (should fit on a single V100-32GB):
-# ./prepare_dataset.sh DGX2 15
+# to run on GPU with a frequency limit of 15 using Spark GPU:
+#   ./prepare_dataset.sh 15 GPU Spark
 #
-# to run on CPU with a frequency limit of 15:
-# ./prepare_dataset.sh CPU 15
-
-
+# to run on CPU with a frequency limit of 15 using Spark CPU:
+#   ./prepare_dataset.sh 15 CPU

 set -e
 set -x

 ls -ltrash

+
+rm -rf /data/dlrm/spark
+rm -rf /data/dlrm/intermediate_binary
+rm -rf /data/dlrm/output
+rm -rf /data/dlrm/criteo_parquet
+rm -rf /data/dlrm/binary_dataset
+
+
 download_dir=${download_dir:-'/data/dlrm/criteo'}
 ./verify_criteo_downloaded.sh ${download_dir}

-spark_output_path=${spark_output_path:-'/data/dlrm/spark/output'}
+output_path=${output_path:-'/data/dlrm/output'}


-if [ -f ${spark_output_path}/train/_SUCCESS ] \
-   && [ -f ${spark_output_path}/validation/_SUCCESS ] \
-   && [ -f ${spark_output_path}/test/_SUCCESS ]; then
-
-   echo "Spark preprocessing already carried out"
+if [ "$3" = "NVTabular" ]; then
+    echo "Performing NVTabular preprocessing"
+    ./run_NVTabular.sh ${download_dir} ${output_path} $1
+    preprocessing_version=NVTabular
 else
-   echo "Performing spark preprocessing"
-   ./run_spark.sh $1 ${download_dir} ${spark_output_path} $2
+    if [ -f ${output_path}/train/_SUCCESS ] \
+        && [ -f ${output_path}/validation/_SUCCESS ] \
+        && [ -f ${output_path}/test/_SUCCESS ]; then
+
+        echo "Spark preprocessing already carried out"
+    else
+        echo "Performing spark preprocessing"
+        ./run_spark.sh $2 ${download_dir} ${output_path} $1
+    fi
+    preprocessing_version=Spark
 fi

 conversion_intermediate_dir=${conversion_intermediate_dir:-'/data/dlrm/intermediate_binary'}
 final_output_dir=${final_output_dir:-'/data/dlrm/binary_dataset'}

+source ${DGX_VERSION}_config.sh

 if [ -d ${final_output_dir}/train ] \
-   && [ -d ${final_output_dir}/val ] \
+   && [ -d ${final_output_dir}/validation ] \
   && [ -d ${final_output_dir}/test ] \
   && [ -f ${final_output_dir}/model_sizes.json ]; then

    echo "Final conversion already done"
 else
    echo "Performing final conversion to a custom data format"
-    python parquet_to_binary.py --parallel_jobs 40 --src_dir ${spark_output_path} \
+    python parquet_to_binary.py --parallel_jobs ${TOTAL_CORES} --src_dir ${output_path} \
                                --intermediate_dir  ${conversion_intermediate_dir} \
                                --dst_dir ${final_output_dir}

-    cp "${spark_output_path}/model_size.json" "${final_output_dir}/model_size.json"
+    cp "${output_path}/model_size.json" "${final_output_dir}/model_size.json"

    python split_dataset.py --dataset "${final_output_dir}" --output "${final_output_dir}/split"
    rm ${final_output_dir}/train_data.bin
-    rm ${final_output_dir}/val_data.bin
+    rm ${final_output_dir}/validation_data.bin
    rm ${final_output_dir}/test_data.bin

    mv ${final_output_dir}/split/* ${final_output_dir}
@ -75,5 +89,3 @@ else
 fi

 echo "Done preprocessing the Criteo Kaggle Dataset"
-echo "You can now start the training with: "
-echo "python -m dlrm.scripts.main --mode train --dataset  ${final_output_dir}"
--- a/PyTorch/Recommendation/DLRM/preproc/preproc_NVTabular.py
+++ b/PyTorch/Recommendation/DLRM/preproc/preproc_NVTabular.py
@ -0,0 +1,348 @@
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Preprocess Criteo 1TB Click Logs dataset with frequency thresholding and filling missing values.
+
+This script accepts input in either tsv or parquet format.
+"""
+
+import argparse
+from collections import OrderedDict
+import json
+import os
+import subprocess
+from time import time
+from typing import List, Optional
+
+import numpy as np
+import nvtabular as nvt
+import rmm
+import cudf
+from dask.base import tokenize
+from dask.dataframe.io.parquet.utils import _analyze_paths
+from dask.delayed import Delayed
+from dask.distributed import Client
+from dask.highlevelgraph import HighLevelGraph
+from dask.utils import natural_sort_key
+from dask_cuda import LocalCUDACluster
+from fsspec.core import get_fs_token_paths
+from nvtabular import Workflow
+from nvtabular.io import Dataset, Shuffle
+from nvtabular.utils import device_mem_size
+from nvtabular.ops import Normalize, Categorify, LogOp, FillMissing, Clip, get_embedding_sizes, \
+    LambdaOp
+from cudf.io.parquet import ParquetWriter
+
+# Column layout of one record: `_c0` is the click label, `_c1`..`_c13` are the
+# continuous features and `_c14`..`_c39` are the categorical features.
+CRITEO_CONTINUOUS_COLUMNS = [f'_c{x}' for x in range(1, 14)]
+CRITEO_CATEGORICAL_COLUMNS = [f'_c{x}' for x in range(14, 40)]
+CRITEO_CLICK_COLUMNS = ['_c0']
+# All columns, features first and the label last.
+COLUMNS = CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS + CRITEO_CLICK_COLUMNS
+# Day indices used for training (0..22); day_23 is split into validation and test.
+CRITEO_TRAIN_DAYS = list(range(0, 23))
+
+# Values passed as `part_mem_fraction` when the nvtabular Datasets are created.
+ALL_DS_MEM_FRAC = 0.04
+TRAIN_DS_MEM_FRAC = 0.045
+TEST_DS_MEM_FRAC = 0.3
+VALID_DS_MEM_FRAC = 0.3
+
+def _pool(frac=0.8):
+    initial_pool_size = frac * device_mem_size()
+    if initial_pool_size % 256 != 0:
+        new_initial_pool_size = initial_pool_size // 256 * 256
+        print(
+            f"Initial pool size for rmm has to be a multiply of 256. Got {initial_pool_size}, reducing to {new_initial_pool_size}")
+        initial_pool_size = new_initial_pool_size
+
+    rmm.reinitialize(
+        pool_allocator=True,
+        initial_pool_size=initial_pool_size,
+    )
+
+
+def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
+    fn = f"{name}.parquet"
+    out_path = fs.sep.join([out_dir, f"{name}.parquet"])
+    writer = ParquetWriter(out_path, compression=None)
+    for gdf in nvt.Dataset(
+        path,
+        engine="csv",
+        names=cols,
+        part_memory_fraction=gpu_mem_frac,
+        sep='\t',
+        dtypes=dtypes,
+    ).to_iter():
+        writer.write_table(gdf)
+        del gdf
+    md = writer.close(metadata_file_path=fn)
+    return md
+
+
+def _write_metadata(md_list, fs, path):
+    if md_list:
+        metadata_path = fs.sep.join([path, "_metadata"])
+        _meta = (
+            cudf.io.merge_parquet_filemetadata(md_list)
+            if len(md_list) > 1
+            else md_list[0]
+        )
+        with fs.open(metadata_path, "wb") as f:
+            _meta.tofile(f)
+    return True
+
+
+def convert_criteo_to_parquet(
+    input_path: str,
+    output_path: str,
+    client,
+    gpu_mem_frac: float = 0.05,
+):
+    print("Converting tsv to parquet files")
+    if not output_path:
+        raise RuntimeError("Intermediate directory must be defined, if the dataset is tsv.")
+    os.makedirs(output_path, exist_ok=True)
+
+    # split last day into two parts
+    number_of_lines = int(
+        subprocess.check_output((f'wc -l {os.path.join(input_path, "day_23")}').split()).split()[0])
+    valid_set_size = number_of_lines // 2
+    test_set_size = number_of_lines - valid_set_size
+
+    with open(os.path.join(input_path, "day_23.part1"), "w") as f:
+        subprocess.run(['head', '-n', str(test_set_size), str(os.path.join(input_path, "day_23"))], stdout=f)
+
+    with open(os.path.join(input_path, "day_23.part2"), "w") as f:
+        subprocess.run(['tail', '-n', str(valid_set_size), str(os.path.join(input_path, "day_23"))], stdout=f)
+
+    fs = get_fs_token_paths(input_path, mode="rb")[0]
+    file_list = [
+        x for x in fs.glob(fs.sep.join([input_path, "day_*"]))
+        if not x.endswith("parquet")
+    ]
+    file_list = sorted(file_list, key=natural_sort_key)
+    name_list = _analyze_paths(file_list, fs)[1]
+
+    cols = CRITEO_CLICK_COLUMNS + CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS
+
+    dtypes = {}
+    dtypes[CRITEO_CLICK_COLUMNS[0]] = np.int64
+    for x in CRITEO_CONTINUOUS_COLUMNS:
+        dtypes[x] = np.int64
+    for x in CRITEO_CATEGORICAL_COLUMNS:
+        dtypes[x] = "hex"
+
+    dsk = {}
+    token = tokenize(file_list, name_list, output_path, gpu_mem_frac, fs, cols, dtypes)
+    convert_file_name = "convert_file-" + token
+    for i, (path, name) in enumerate(zip(file_list, name_list)):
+        key = (convert_file_name, i)
+        dsk[key] = (_convert_file, path, name, output_path, gpu_mem_frac, fs, cols, dtypes)
+
+    write_meta_name = "write-metadata-" + token
+    dsk[write_meta_name] = (
+        _write_metadata,
+        [(convert_file_name, i) for i in range(len(file_list))],
+        fs,
+        output_path,
+    )
+    graph = HighLevelGraph.from_collections(write_meta_name, dsk, dependencies=[])
+    conversion_delayed = Delayed(write_meta_name, graph)
+
+    if client:
+        conversion_delayed.compute()
+    else:
+        conversion_delayed.compute(scheduler="synchronous")
+
+    print("Converted")
+
+
+def save_model_size_config(workflow: Workflow, output_path: str):
+    embeddings = {}
+    for k, v in get_embedding_sizes(workflow).items():
+        embeddings[k] = v[0] - 1  # we have to subtract one, as the model expects to get a maximal id for each category
+
+    ordered_dict = OrderedDict()
+    for k, v in sorted(list(embeddings.items()), key=lambda x: x[0]):
+        ordered_dict[k] = v
+    with open(os.path.join(output_path, "model_size.json"), 'w') as file:
+        file.write(json.dumps(ordered_dict))
+
+
+def preprocess_criteo_parquet(
+    input_path: str,
+    output_path: str,
+    client,
+    frequency_threshold: int,
+):
+    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
+    train_files = [
+        os.path.join(input_path, x)
+        for x in os.listdir(input_path)
+        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
+    ]
+    valid_file = os.path.join(input_path, "day_23.part2.parquet")
+    test_file = os.path.join(input_path, "day_23.part1.parquet")
+
+    all_set = train_files + [valid_file] + [test_file]
+
+    print(all_set, train_files, valid_file, test_file)
+    print("Creating Workflow Object")
+
+    workflow = Workflow(
+        cat_names=CRITEO_CATEGORICAL_COLUMNS,
+        cont_names=CRITEO_CONTINUOUS_COLUMNS,
+        label_name=CRITEO_CLICK_COLUMNS
+    )
+
+    # We want to assign 0 to all missing values, and calculate log(x+3) for present values
+    # so if we set missing values to -2, then the result of log(1+2+(-2)) would be 0
+    workflow.add_cont_feature([
+        FillMissing(fill_val=-2.0),
+        LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne', f=lambda col, _: col.add(2.0)),
+        LogOp(),  # Log(1+x)
+    ])
+
+    workflow.add_cat_preprocess(
+        Categorify(freq_threshold=frequency_threshold, out_path=output_path)
+    )
+
+    workflow.finalize()
+
+    print("Creating Dataset Iterator")
+    all_ds = Dataset(all_set, engine="parquet", part_mem_fraction=ALL_DS_MEM_FRAC)
+    trains_ds = Dataset(train_files, engine="parquet", part_mem_fraction=TRAIN_DS_MEM_FRAC)
+    valid_ds = Dataset(valid_file, engine="parquet", part_mem_fraction=TEST_DS_MEM_FRAC)
+    test_ds = Dataset(test_file, engine="parquet", part_mem_fraction=VALID_DS_MEM_FRAC)
+
+    print("Running apply")
+    out_train = os.path.join(output_path, "train")
+    out_valid = os.path.join(output_path, "validation")
+    out_test = os.path.join(output_path, "test")
+
+    start = time()
+    workflow.update_stats(all_ds)
+    print(f"Gathering statistics time: {time() - start}")
+
+    start = time()
+    workflow.apply(
+        trains_ds,
+        record_stats=False,
+        output_path=out_train
+    )
+    print(f"train preprocess time: {time() - start}")
+
+    start = time()
+    workflow.apply(
+        valid_ds,
+        record_stats=False,
+        output_path=out_valid
+    )
+    print(f"valid preprocess time: {time() - start}")
+
+    start = time()
+    workflow.apply(
+        test_ds,
+        record_stats=False,
+        output_path=out_test
+    )
+    print(f"test preprocess time: {time() - start}")
+
+    save_model_size_config(workflow, output_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Process some integers.")
+    parser.add_argument(
+        "input_dir",
+        help="directory with either csv or parquet dataset files inside"
+    )
+    parser.add_argument(
+        "output_dir",
+        help="directory to save preprocessed dataset files"
+    )
+    parser.add_argument(
+        "--intermediate_dir",
+        required=False,
+        default=None,
+        help="directory for converted to parquet dataset files inside"
+    )
+    parser.add_argument(
+        "--devices",
+        required=True,
+        help="available gpus, separated with commas; e.g 0,1,2,3"
+    )
+    parser.add_argument(
+        "--freq_threshold",
+        required=False,
+        default=15,
+        help="frequency threshold for categorical can be int or dict {column_name: threshold}"
+    )
+    parser.add_argument(
+        "--pool",
+        required=False,
+        default=False,
+        help="bool value to use a RMM pooled allocator"
+    )
+
+    args = parser.parse_args()
+
+    args.devices = args.devices.split(",")
+
+    return args
+
+
+def is_input_parquet(input_dir: str):
+    for f in os.listdir(input_dir):
+        if 'parquet' in f:
+            return True
+    return False
+
+
+def start_local_CUDA_cluster(devices, pool):
+    if len(devices) > 1:
+        cluster = LocalCUDACluster(
+            n_workers=len(devices),
+            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
+        )
+        client = Client(cluster)
+        if pool:
+            client.run(_pool)
+    elif pool:
+        _pool()
+    return client
+
+
+def main():
+    args = parse_args()
+
+    client = start_local_CUDA_cluster(args.devices, args.pool)
+
+    if not is_input_parquet(args.input_dir):
+        convert_criteo_to_parquet(
+            input_path=args.input_dir,
+            output_path=args.intermediate_dir,
+            client=client,
+        )
+        args.input_dir = args.intermediate_dir
+
+    print("Preprocessing data")
+    preprocess_criteo_parquet(
+        input_path=args.input_dir,
+        output_path=args.output_dir,
+        client=client,
+        frequency_threshold=int(args.freq_threshold),
+    )
+    print("Done")
+
+
+if __name__ == '__main__':
+    main()
--- a/PyTorch/Recommendation/DLRM/preproc/run_NVTabular.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_NVTabular.sh
@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#########################################################################
+# File Name: run_NVTabular.sh
+
+set -e
+
+# DGX_VERSION selects the GPU list and the <DGX_VERSION>_config.sh sourced below;
+# fail before the long preprocessing step instead of after it
+: "${DGX_VERSION:?DGX_VERSION must be set, e.g. DGX-2}"
+
+# the data path including 1TB criteo data, day_0, day_1, ...
+export INPUT_PATH=${1:-'/data/dlrm/criteo'}
+
+# the output path, use for generating the dictionary and the final dataset
+# the output folder should have more than 300GB
+export OUTPUT_PATH=${2:-'/data/dlrm/output'}
+
+export FREQUENCY_LIMIT=${3:-'15'}
+
+# directory receiving the parquet version of the raw input
+export CRITEO_PARQUET=${4:-'/data/dlrm/criteo_parquet'}
+
+if [ "$DGX_VERSION" = "DGX-2" ]; then
+    export DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+else
+    export DEVICES=0,1,2,3,4,5,6,7
+fi
+
+echo "Preprocessing data"
+python preproc_NVTabular.py "$INPUT_PATH" "$OUTPUT_PATH" --devices "$DEVICES" --intermediate_dir "$CRITEO_PARQUET" --freq_threshold "$FREQUENCY_LIMIT"
+
+echo "Shuffling"
+
+source "${DGX_VERSION}_config.sh"
+
+export SPARK_HOME=/opt/spark
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
+export MASTER=spark://$HOSTNAME:7077
+export SPARK_LOCAL_DIRS='/data/dlrm/spark/tmp'
+mkdir -p "$SPARK_LOCAL_DIRS"
+
+# with `set -e` a failing spark-submit ends the script immediately; the trap
+# makes sure the standalone daemons are stopped in that case too
+stop_spark() {
+    stop-master.sh || true
+    stop-slave.sh || true
+}
+trap stop_spark EXIT
+
+echo "Starting spark standalone"
+start-master.sh
+start-slave.sh "$MASTER"
+
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=1 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=1200 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dai.rapids.cudf.prefer-pinned=true\ -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    NVT_shuffle_spark.py --input_path "$OUTPUT_PATH/train" --output_path "$OUTPUT_PATH/shuffled_train"
+
+# normal path: stop the daemons now and disarm the trap
+stop_spark
+trap - EXIT
+
+# replace the unshuffled training split with the shuffled one
+rm -rf "$OUTPUT_PATH/train"
+mv "$OUTPUT_PATH/shuffled_train" "$OUTPUT_PATH/train"
+
+
+
--- a/PyTorch/Recommendation/DLRM/preproc/run_spark.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_spark.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -21,14 +21,19 @@
 echo "Input mode option: $1"
 if [ "$1" = "CPU" ]
 then
-   echo "Run with CPU.";
-   shift
-   ./run_spark_cpu.sh ${@}
-elif [ "$1" = "DGX2" ]
+    echo "Run with CPU.";
+    shift
+    ./run_spark_cpu.sh ${@}
+elif [ "$1" = "GPU" ]
 then
-   echo "Run with GPU.";
-   shift
-   ./run_spark_gpu.sh ${@} DGX2
+    echo "Run with GPU.";
+    shift
+    if [ "$DGX_VERSION" = "DGX-2" ]
+    then
+        ./run_spark_gpu_DGX-2.sh ${@}
+    else
+        ./run_spark_gpu_DGX-A100.sh ${@}
+    fi
 else
-   echo "Please choose mode (CPU/DGX2).";
+   echo "Please choose mode (CPU/GPU).";
 fi
--- a/PyTorch/Recommendation/DLRM/preproc/run_spark_cpu.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_spark_cpu.sh
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -54,18 +54,9 @@ export DRIVER_MEMORY=32
 # the memory per executor
 export EXECUTOR_MEMORY=$(((${TOTAL_MEMORY}-${DRIVER_MEMORY})/${NUM_EXECUTORS}))

-
 OPTS="--frequency_limit $FREQUENCY_LIMIT"

-# use frequency_limit=15 or not
-# by default use a frequency limit of 15
-USE_FREQUENCY_LIMIT=1
-OPTS=""
-if [[ $USE_FREQUENCY_LIMIT == 1 ]]; then
-    OPTS="--frequency_limit 15"
-fi
-
-export SPARK_HOME=/opt/spark-3.0.0-bin-hadoop3.2
+export SPARK_HOME=/opt/spark
 export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
 export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH

--- a/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu.sh
@ -1,195 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#########################################################################
-# File Name: run_spark_gpu.sh
-
-set -e
-
-# the data path including 1TB criteo data, day_0, day_1, ...
-export INPUT_PATH=${1:-'/data/dlrm/criteo'}
-
-# the output path, use for generating the dictionary and the final dataset
-# the output folder should have more than 300GB
-export OUTPUT_PATH=${2:-'/data/dlrm/spark/output'}
-
-export FREQUENCY_LIMIT=${3:-'15'}
-
-export HARDWARE_PLATFORM=${4:-'DGX2'}
-
-# spark local dir should have about 3TB
-# the temporary path used for spark shuffle write
-export SPARK_LOCAL_DIRS='/data/dlrm/spark/tmp'
-
-if [[ $HARDWARE_PLATFORM == DGX2 ]]; then
-    source dgx2_config.sh
-else
-    echo "Unknown hardware platform ${HARDWARE_PLATFORM}"
-    exit 1
-fi
-
-OPTS="--frequency_limit $FREQUENCY_LIMIT"
-
-export SPARK_HOME=/opt/spark-3.0.0-bin-hadoop3.2
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
-
-# we use spark standalone to run the job
-export MASTER=spark://$HOSTNAME:7077
-
-echo "Starting spark standalone"
-start-master.sh
-start-slave.sh $MASTER
-
-echo "Generating the dictionary..."
-spark-submit --master $MASTER \
-    	--driver-memory "${DRIVER_MEMORY}G" \
-    	--executor-cores $NUM_EXECUTOR_CORES \
-    	--executor-memory "${EXECUTOR_MEMORY}G" \
-    	--conf spark.cores.max=$TOTAL_CORES \
-    	--conf spark.task.cpus=1 \
-        --conf spark.sql.files.maxPartitionBytes=1073741824 \
-    	--conf spark.sql.shuffle.partitions=600 \
-    	--conf spark.driver.maxResultSize=2G \
-    	--conf spark.locality.wait=0s \
-    	--conf spark.network.timeout=1800s \
-        --conf spark.task.resource.gpu.amount=0.01 \
-        --conf spark.executor.resource.gpu.amount=1 \
-        --conf spark.plugins=com.nvidia.spark.SQLPlugin \
-        --conf spark.rapids.sql.concurrentGpuTasks=2 \
-        --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
-        --conf spark.rapids.memory.pinnedPool.size=16g \
-        --conf spark.rapids.sql.explain=ALL \
-        --conf spark.sql.autoBroadcastJoinThreshold=1GB \
-        --conf spark.rapids.sql.incompatibleOps.enabled=true \
-        --conf spark.driver.maxResultSize=2G \
-        --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true\ -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
-    	spark_data_utils.py --mode generate_models \
-    	$OPTS \
-    	--input_folder $INPUT_PATH \
-    	--days 0-23 \
-    	--model_folder $OUTPUT_PATH/models \
-    	--write_mode overwrite --low_mem 2>&1 | tee submit_dict_log.txt
-
-echo "Transforming the train data from day_0 to day_22..."
-spark-submit --master $MASTER \
-    	--driver-memory "${DRIVER_MEMORY}G" \
-    	--executor-cores $NUM_EXECUTOR_CORES \
-    	--executor-memory "${EXECUTOR_MEMORY}G" \
-    	--conf spark.cores.max=$TOTAL_CORES \
-    	--conf spark.task.cpus=1 \
-        --conf spark.sql.files.maxPartitionBytes=1073741824 \
-    	--conf spark.sql.shuffle.partitions=600 \
-    	--conf spark.driver.maxResultSize=2G \
-    	--conf spark.locality.wait=0s \
-    	--conf spark.network.timeout=1800s \
-        --conf spark.task.resource.gpu.amount=0.01 \
-        --conf spark.executor.resource.gpu.amount=1 \
-        --conf spark.plugins=com.nvidia.spark.SQLPlugin \
-        --conf spark.rapids.sql.concurrentGpuTasks=2 \
-        --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
-        --conf spark.rapids.memory.pinnedPool.size=16g \
-        --conf spark.rapids.sql.explain=ALL \
-        --conf spark.sql.autoBroadcastJoinThreshold=1GB \
-        --conf spark.rapids.sql.incompatibleOps.enabled=true \
-        --conf spark.driver.maxResultSize=2G \
-        --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true\ -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
-    	spark_data_utils.py --mode transform \
-    	--input_folder $INPUT_PATH \
-    	--days 0-22 \
-    	--output_folder $OUTPUT_PATH/train \
-        --model_size_file $OUTPUT_PATH/model_size.json \
-    	--model_folder $OUTPUT_PATH/models \
-    	--write_mode overwrite --low_mem 2>&1 | tee submit_train_log.txt
-
-echo "Splitting the last day into 2 parts of test and validation..."
-last_day=$INPUT_PATH/day_23
-temp_test=$OUTPUT_PATH/temp/test
-temp_validation=$OUTPUT_PATH/temp/validation
-mkdir -p $temp_test $temp_validation
-
-lines=`wc -l $last_day | awk '{print $1}'`
-former=$((lines / 2))
-latter=$((lines - former))
-
-head -n $former $last_day > $temp_test/day_23
-tail -n $latter $last_day > $temp_validation/day_23
-
-echo "Transforming the test data in day_23..."
-spark-submit --master $MASTER \
-    	--driver-memory "${DRIVER_MEMORY}G" \
-    	--executor-cores $NUM_EXECUTOR_CORES \
-    	--executor-memory "${EXECUTOR_MEMORY}G" \
-    	--conf spark.cores.max=$TOTAL_CORES \
-    	--conf spark.task.cpus=1 \
-        --conf spark.sql.files.maxPartitionBytes=1073741824 \
-    	--conf spark.sql.shuffle.partitions=30 \
-    	--conf spark.driver.maxResultSize=2G \
-    	--conf spark.locality.wait=0s \
-    	--conf spark.network.timeout=1800s \
-        --conf spark.task.resource.gpu.amount=0.01 \
-        --conf spark.executor.resource.gpu.amount=1 \
-        --conf spark.plugins=com.nvidia.spark.SQLPlugin \
-        --conf spark.rapids.sql.concurrentGpuTasks=2 \
-        --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
-        --conf spark.rapids.memory.pinnedPool.size=16g \
-        --conf spark.rapids.sql.explain=ALL \
-        --conf spark.sql.autoBroadcastJoinThreshold=1GB \
-        --conf spark.rapids.sql.incompatibleOps.enabled=true \
-        --conf spark.driver.maxResultSize=2G \
-        --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true\ -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
-    	spark_data_utils.py --mode transform \
-    	--input_folder $temp_test \
-    	--days 23-23 \
-    	--output_folder $OUTPUT_PATH/test \
-    	--output_ordering input \
-    	--model_folder $OUTPUT_PATH/models \
-    	--write_mode overwrite --low_mem 2>&1 | tee submit_test_log.txt
-
-echo "Transforming the validation data in day_23..."
-spark-submit --master $MASTER \
-    	--driver-memory "${DRIVER_MEMORY}G" \
-    	--executor-cores $NUM_EXECUTOR_CORES \
-    	--executor-memory "${EXECUTOR_MEMORY}G" \
-    	--conf spark.cores.max=$TOTAL_CORES \
-    	--conf spark.task.cpus=1 \
-        --conf spark.sql.files.maxPartitionBytes=1073741824 \
-    	--conf spark.sql.shuffle.partitions=30 \
-    	--conf spark.driver.maxResultSize=2G \
-    	--conf spark.locality.wait=0s \
-    	--conf spark.network.timeout=1800s \
-        --conf spark.task.resource.gpu.amount=0.01 \
-        --conf spark.executor.resource.gpu.amount=1 \
-        --conf spark.plugins=com.nvidia.spark.SQLPlugin \
-        --conf spark.rapids.sql.concurrentGpuTasks=2 \
-        --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
-        --conf spark.rapids.memory.pinnedPool.size=16g \
-        --conf spark.rapids.sql.explain=ALL \
-        --conf spark.sql.autoBroadcastJoinThreshold=1GB \
-        --conf spark.rapids.sql.incompatibleOps.enabled=true \
-        --conf spark.driver.maxResultSize=2G \
-        --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true\ -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
-    	spark_data_utils.py --mode transform \
-    	--input_folder $temp_validation \
-    	--days 23-23 \
-    	--output_folder $OUTPUT_PATH/validation \
-    	--output_ordering input \
-    	--model_folder $OUTPUT_PATH/models \
-    	--write_mode overwrite --low_mem 2>&1 | tee submit_validation_log.txt
-
-rm -r $temp_test $temp_validation
-stop-master.sh
-stop-slave.sh
--- a/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu_DGX-2.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu_DGX-2.sh
@ -0,0 +1,190 @@
+#!/bin/bash
+
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#########################################################################
+# File Name: run_spark_gpu_DGX-2.sh
+# Usage: run_spark_gpu_DGX-2.sh [INPUT_PATH] [OUTPUT_PATH] [FREQUENCY_LIMIT]
+
+# abort on the first failure; pipefail keeps "| tee" from masking a failed spark-submit
+set -eo pipefail
+
+# the data path including 1TB criteo data, day_0, day_1, ...
+export INPUT_PATH=${1:-'/data/dlrm/criteo'}
+
+# the output path, use for generating the dictionary and the final dataset
+# the output folder should have more than 300GB
+export OUTPUT_PATH=${2:-'/data/dlrm/output'}
+
+export FREQUENCY_LIMIT=${3:-'15'}
+
+export HARDWARE_PLATFORM='DGX-2'
+
+# spark local dir should have about 3TB
+# the temporary path used for spark shuffle write
+export SPARK_LOCAL_DIRS='/data/dlrm/spark/tmp'
+
+source DGX-2_config.sh
+
+OPTS=(--frequency_limit "$FREQUENCY_LIMIT")
+
+export SPARK_HOME=/opt/spark
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
+
+# we use spark standalone to run the job
+export MASTER=spark://$HOSTNAME:7077
+
+# always shut the standalone cluster down on exit, even when a stage below fails
+trap 'stop-master.sh || true; stop-slave.sh || true' EXIT
+echo "Starting spark standalone"
+start-master.sh
+start-slave.sh "$MASTER"
+
+# NOTE: the two executor JVM options below are separated by a plain space inside the
+# quotes; within double quotes bash would pass a backslash before the space literally
+echo "Generating the dictionary..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=1 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode generate_models \
+    "${OPTS[@]}" \
+    --input_folder "$INPUT_PATH" \
+    --days 0-23 \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_dict_log.txt
+
+echo "Transforming the train data from day_0 to day_22..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=3 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$INPUT_PATH" \
+    --days 0-22 \
+    --output_folder "$OUTPUT_PATH/train" \
+    --model_size_file "$OUTPUT_PATH/model_size.json" \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_train_log.txt
+
+echo "Splitting the last day into 2 parts of test and validation..."
+last_day=$INPUT_PATH/day_23
+temp_test=$OUTPUT_PATH/temp/test
+temp_validation=$OUTPUT_PATH/temp/validation
+mkdir -p "$temp_test" "$temp_validation"
+
+lines=$(wc -l < "$last_day")
+former=$((lines / 2))
+latter=$((lines - former))
+
+head -n "$former" "$last_day" > "$temp_test/day_23"
+tail -n "$latter" "$last_day" > "$temp_validation/day_23"
+
+echo "Transforming the test data in day_23..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=1 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=30 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$temp_test" \
+    --days 23-23 \
+    --output_folder "$OUTPUT_PATH/test" \
+    --output_ordering input \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_test_log.txt
+
+echo "Transforming the validation data in day_23..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=1 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=30 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$temp_validation" \
+    --days 23-23 \
+    --output_folder "$OUTPUT_PATH/validation" \
+    --output_ordering input \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_validation_log.txt
+
+rm -r "$temp_test" "$temp_validation"
--- a/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu_DGX-A100.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/run_spark_gpu_DGX-A100.sh
@ -0,0 +1,190 @@
+#!/bin/bash
+
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#########################################################################
+# File Name: run_spark_gpu_DGX-A100.sh
+# Usage: run_spark_gpu_DGX-A100.sh [INPUT_PATH] [OUTPUT_PATH] [FREQUENCY_LIMIT]
+
+# abort on the first failure; pipefail keeps "| tee" from masking a failed spark-submit
+set -eo pipefail
+
+# the data path including 1TB criteo data, day_0, day_1, ...
+export INPUT_PATH=${1:-'/data/dlrm/criteo'}
+
+# the output path, use for generating the dictionary and the final dataset
+# the output folder should have more than 300GB
+export OUTPUT_PATH=${2:-'/data/dlrm/output'}
+
+export FREQUENCY_LIMIT=${3:-'15'}
+
+export HARDWARE_PLATFORM='DGX-A100'
+
+# spark local dir should have about 3TB
+# the temporary path used for spark shuffle write
+export SPARK_LOCAL_DIRS='/data/dlrm/spark/tmp'
+
+source DGX-A100_config.sh
+
+OPTS=(--frequency_limit "$FREQUENCY_LIMIT")
+
+export SPARK_HOME=/opt/spark
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
+
+# we use spark standalone to run the job
+export MASTER=spark://$HOSTNAME:7077
+
+# always shut the standalone cluster down on exit, even when a stage below fails
+trap 'stop-master.sh || true; stop-slave.sh || true' EXIT
+echo "Starting spark standalone"
+start-master.sh
+start-slave.sh "$MASTER"
+
+# NOTE: the two executor JVM options below are separated by a plain space inside the
+# quotes; within double quotes bash would pass a backslash before the space literally
+echo "Generating the dictionary..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=1 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.01 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode generate_models \
+    "${OPTS[@]}" \
+    --input_folder "$INPUT_PATH" \
+    --days 0-23 \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_dict_log.txt
+
+echo "Transforming the train data from day_0 to day_22..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=16 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=0.5 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=2 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$INPUT_PATH" \
+    --days 0-22 \
+    --output_folder "$OUTPUT_PATH/train" \
+    --model_size_file "$OUTPUT_PATH/model_size.json" \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_train_log.txt
+
+echo "Splitting the last day into 2 parts of test and validation..."
+last_day=$INPUT_PATH/day_23
+temp_test=$OUTPUT_PATH/temp/test
+temp_validation=$OUTPUT_PATH/temp/validation
+mkdir -p "$temp_test" "$temp_validation"
+
+lines=$(wc -l < "$last_day")
+former=$((lines / 2))
+latter=$((lines - former))
+
+head -n "$former" "$last_day" > "$temp_test/day_23"
+tail -n "$latter" "$last_day" > "$temp_validation/day_23"
+
+echo "Transforming the test data in day_23..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=32 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=1 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=1 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$temp_test" \
+    --days 23-23 \
+    --output_folder "$OUTPUT_PATH/test" \
+    --output_ordering input \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_test_log.txt
+
+echo "Transforming the validation data in day_23..."
+spark-submit --master "$MASTER" \
+    --driver-memory "${DRIVER_MEMORY}G" \
+    --executor-cores "$NUM_EXECUTOR_CORES" \
+    --executor-memory "${EXECUTOR_MEMORY}G" \
+    --conf spark.cores.max="$TOTAL_CORES" \
+    --conf spark.task.cpus=32 \
+    --conf spark.sql.files.maxPartitionBytes=1073741824 \
+    --conf spark.sql.shuffle.partitions=600 \
+    --conf spark.driver.maxResultSize=2G \
+    --conf spark.locality.wait=0s \
+    --conf spark.network.timeout=1800s \
+    --conf spark.task.resource.gpu.amount=1 \
+    --conf spark.executor.resource.gpu.amount=1 \
+    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+    --conf spark.rapids.sql.concurrentGpuTasks=1 \
+    --conf spark.rapids.sql.reader.batchSizeRows=4000000 \
+    --conf spark.rapids.memory.pinnedPool.size=16g \
+    --conf spark.rapids.sql.explain=ALL \
+    --conf spark.sql.autoBroadcastJoinThreshold=1GB \
+    --conf spark.rapids.sql.incompatibleOps.enabled=true \
+    --conf spark.executor.extraJavaOptions="-Dcom.nvidia.cudf.prefer-pinned=true -Djava.io.tmpdir=$SPARK_LOCAL_DIRS" \
+    spark_data_utils.py --mode transform \
+    --input_folder "$temp_validation" \
+    --days 23-23 \
+    --output_folder "$OUTPUT_PATH/validation" \
+    --output_ordering input \
+    --model_folder "$OUTPUT_PATH/models" \
+    --write_mode overwrite --low_mem 2>&1 | tee submit_validation_log.txt
+
+rm -r "$temp_test" "$temp_validation"
--- a/PyTorch/Recommendation/DLRM/preproc/spark_data_utils.py
+++ b/PyTorch/Recommendation/DLRM/preproc/spark_data_utils.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -428,7 +428,6 @@ def _main():
                delete_combined_model(spark, args.model_folder)

    if args.mode == 'transform':
-        spark.conf.set('spark.sql.shuffle.partitions', args.days * args.apply_shuffle_parallel_per_day)
        with _timed('transform'):
            if args.output_ordering == 'total_random':
                df = rand_ordinal(df)
--- a/PyTorch/Recommendation/DLRM/preproc/split_dataset.py
+++ b/PyTorch/Recommendation/DLRM/preproc/split_dataset.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -89,15 +89,15 @@ def split_binary_file(
 def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
-        categorical_sizes = list(json.load(f).values())
+        categorical_sizes = [int(v) for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
-    val_file = os.path.join(dataset_dir, "val_data.bin")
+    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
-    target_val = os.path.join(output_dir, "val")
+    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
--- a/PyTorch/Recommendation/DLRM/preproc/verify_criteo_downloaded.sh
+++ b/PyTorch/Recommendation/DLRM/preproc/verify_criteo_downloaded.sh
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/requirements.txt
+++ b/PyTorch/Recommendation/DLRM/requirements.txt
@ -1,4 +1,3 @@
 -e git://github.com/NVIDIA/dllogger#egg=dllogger
 absl-py>=0.7.0
-numpy
-pyarrow
+
--- a/PyTorch/Recommendation/DLRM/requirements_preprocessing.txt
+++ b/PyTorch/Recommendation/DLRM/requirements_preprocessing.txt
@ -0,0 +1,4 @@
+numpy
+pandas
+joblib==0.16
+tqdm
--- a/PyTorch/Recommendation/DLRM/setup.py
+++ b/PyTorch/Recommendation/DLRM/setup.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/triton/Dockerfile
+++ b/PyTorch/Recommendation/DLRM/triton/Dockerfile
@ -1,4 +1,4 @@
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
-ARG TRITON_BASE_IMAGE=nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.09-py3
+ARG TRITON_BASE_IMAGE=nvcr.io/nvidia/tritonserver:20.09-py3-clientsdk
 FROM ${TRITON_BASE_IMAGE} as trt
 FROM ${FROM_IMAGE_NAME}

@ -21,11 +21,8 @@ ADD requirements.txt .
 RUN pip install -r requirements.txt
 RUN pip install onnxruntime

-COPY --from=trt /workspace/v2.0.0.clients.tar.gz ./v2.0.0.clients.tar.gz
-RUN tar -xzf v2.0.0.clients.tar.gz \
-    && pip install ./python/tritonclientutils-2.0.0-py3-none-any.whl \
-    && pip install ./python/tritonhttpclient-2.0.0-py3-none-any.whl \
-    && pip install ./python/tritongrpcclient-2.0.0-py3-none-any.whl
+# Copy the tritonclient wheel out of the Triton client SDK stage and install it with all extras.
+# The requirement is quoted so the shell cannot treat "[all]" as a glob pattern.
+COPY --from=trt /workspace/install/python/tritonclient-2.3.0-py3-none-any.whl ./tritonclient-2.3.0-py3-none-any.whl
+RUN pip install --no-cache-dir "./tritonclient-2.3.0-py3-none-any.whl[all]"

 WORKDIR /workspace/dlrm
 COPY . .
--- a/PyTorch/Recommendation/DLRM/triton/README.md
+++ b/PyTorch/Recommendation/DLRM/triton/README.md
@ -25,7 +25,7 @@ The very first step of deployment is to acquire trained checkpoint and model con
 checkpoint. Default model configuration are stored inside `dlrm/config` directory.

 **Currently, our implementation only supports TorchScript deployment for models that fit into the memory of a single GPU.**
-You can read more about training DLRM models on different dataset configurations based on frequency threshold in the preprocessing step in [README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Recommendation/DLRM/README.md#preprocess-with-spark).
+You can read more about training DLRM models on different dataset configurations based on frequency threshold in the preprocessing step in [README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Recommendation/DLRM/README.md#preprocessing-on-gpu).

 #### Inference container

@ -126,11 +126,11 @@ is mounted under `/data`
 #### Running the Triton server
 **NOTE: This step is executed outside inference container**

-1. `docker pull nvcr.io/nvidia/tritonserver:20.06-py3`
-2. `docker run -d --rm --gpus device=0 --ipc=host --network=host [--cpuset-cpus=0-15] -p 8000:8000 -p 8001:8001 -p 8002:8002 -v <PATH_TO_MODEL_REPOSITORY>:/models nvcr.io/nvidia/tritonserver:20.06-py3 tritonserver --model-repository=/models --log-verbose=1 --model-control-mode=explicit`
+1. `docker pull nvcr.io/nvidia/tritonserver:20.09-py3`
+2. `docker run -d --rm --gpus device=0 --ipc=host --network=host [--cpuset-cpus=0-15] -p 8000:8000 -p 8001:8001 -p 8002:8002 -v <PATH_TO_MODEL_REPOSITORY>:/models nvcr.io/nvidia/tritonserver:20.09-py3 tritonserver --model-repository=/models --log-verbose=1 --model-control-mode=explicit`

 Here `--gpus '"device=0,1,2,3"'` selects GPUs indexed by ordinals `0,1,2` and `3`, respectively. The server will see only these GPUs. If you write `device=all`, then the server will see all the available GPUs. `PATH_TO_MODEL_REPOSITORY` indicates location where
-deployed models were stored. Additional `--model-controle-mode` option allows to manually load and
+deployed models were stored. Additional `--model-control-mode` option allows to manually load and
 unload models. This is especially useful when dealing with numerous large models like DLRM.

 For models exported to ONNX format and hosted inside ONNX Runtime, it might be required to limit the visible CPUs to fully utilize GPU acceleration. Use the `--cpuset-cpus` docker option for that.
--- a/PyTorch/Recommendation/DLRM/triton/client.py
+++ b/PyTorch/Recommendation/DLRM/triton/client.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@ -31,7 +31,7 @@ import sys

 import numpy as np
 import torch
-import tritonhttpclient
+import tritonclient.http as http_client
 from sklearn.metrics import roc_auc_score
 from tqdm import tqdm

@ -73,14 +73,14 @@ def run_infer(model_name, model_version, numerical_features, categorical_feature
    inputs = []
    outputs = []
    num_type = "FP16" if numerical_features.dtype == np.float16 else "FP32"
-    inputs.append(tritonhttpclient.InferInput('input__0', numerical_features.shape, num_type))
-    inputs.append(tritonhttpclient.InferInput('input__1', categorical_features.shape, "INT64"))
+    inputs.append(http_client.InferInput('input__0', numerical_features.shape, num_type))
+    inputs.append(http_client.InferInput('input__1', categorical_features.shape, "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(numerical_features, binary_data=True)
    inputs[1].set_data_from_numpy(categorical_features, binary_data=False)

-    outputs.append(tritonhttpclient.InferRequestedOutput('output__0', binary_data=True))
+    outputs.append(http_client.InferRequestedOutput('output__0', binary_data=True))
    results = triton_client.infer(model_name,
                                  inputs,
                                  model_version=str(model_version) if model_version != -1 else '',
@ -124,7 +124,7 @@ if __name__ == '__main__':

    FLAGS = parser.parse_args()
    try:
-        triton_client = tritonhttpclient.InferenceServerClient(url=FLAGS.triton_server_url, verbose=FLAGS.verbose)
+        triton_client = http_client.InferenceServerClient(url=FLAGS.triton_server_url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)
--- a/PyTorch/Recommendation/DLRM/triton/deployer.py
+++ b/PyTorch/Recommendation/DLRM/triton/deployer.py
@ -1,6 +1,6 @@
 #!/usr/bin/python

-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/PyTorch/Recommendation/DLRM/triton/deployer_lib.py
+++ b/PyTorch/Recommendation/DLRM/triton/deployer_lib.py
@ -1,6 +1,6 @@
 #!/usr/bin/python

-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.