[WIDENDEEP/TF2] Update NVTabular version and add various model optimizations

2021-11-08 14:10:12 -08:00 · 2021-11-08 14:10:12 -08:00 · 01a9f5b48c
parent 2592d5a02c
commit 01a9f5b48c
49 changed files with 7022 additions and 19855 deletions
--- a/TensorFlow2/Recommendation/WideAndDeep/Dockerfile
+++ b/TensorFlow2/Recommendation/WideAndDeep/Dockerfile
@ -1 +0,0 @@
-Dockerfile-train
--- a/TensorFlow2/Recommendation/WideAndDeep/Dockerfile
+++ b/TensorFlow2/Recommendation/WideAndDeep/Dockerfile
@ -0,0 +1,32 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/merlin/merlin-tensorflow-training:21.09
+
+FROM ${FROM_IMAGE_NAME}
+
+ENV HOROVOD_CYCLE_TIME=0.1
+ENV HOROVOD_FUSION_THRESHOLD=67108864
+ENV HOROVOD_NUM_STREAMS=2
+
+
+USER root
+
+RUN pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
+
+WORKDIR  /wd
+
+COPY . .
+
+RUN cd /nvtabular && git checkout v0.6.1
--- a/TensorFlow2/Recommendation/WideAndDeep/Dockerfile-preproc
+++ b/TensorFlow2/Recommendation/WideAndDeep/Dockerfile-preproc
@ -1,54 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/nvtabular:0.3
-
-FROM ${FROM_IMAGE_NAME}
-
-USER root
-
-# Spark dependencies
-ENV APACHE_SPARK_VERSION 2.3.1
-ENV HADOOP_VERSION 2.7
-
-RUN apt-get -y update && \
-    apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java time && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN cd /tmp && \
-        wget -q http://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
-        echo "DC3A97F3D99791D363E4F70A622B84D6E313BD852F6FDBC777D31EAB44CBC112CEEAA20F7BF835492FB654F48AE57E9969F93D3B0E6EC92076D1C5E1B40B4696 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
-        tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local --owner root --group root --no-same-owner && \
-        rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
-RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
-
-# Spark config
-ENV SPARK_HOME /usr/local/spark
-ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:/wd
-ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info
-ENV PYSPARK_PYTHON /conda/envs/rapids/bin/python
-ENV PYSPARK_DRIVER_PYTHON /conda/envs/rapids/bin/python
-
-SHELL ["/bin/bash", "-c"]
-
-RUN source activate rapids && \
-    pip install --upgrade pip && \
-    pip install --no-cache-dir pyspark==2.3.1 && \
-    pip install --no-cache-dir --no-deps tensorflow-transform==0.24.1 apache-beam==2.14 tensorflow-metadata==0.14.0 pydot dill \
-    pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
-
-WORKDIR  /wd
-
-COPY . .
--- a/TensorFlow2/Recommendation/WideAndDeep/LICENSE
+++ b/TensorFlow2/Recommendation/WideAndDeep/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2021 NVIDIA Corporation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/TensorFlow2/Recommendation/WideAndDeep/README.md
+++ b/TensorFlow2/Recommendation/WideAndDeep/README.md
@ -25,27 +25,24 @@ The content of the repository is tested and maintained by NVIDIA.
  * [Getting the data](#getting-the-data)
    + [Dataset guidelines](#dataset-guidelines)
    + [Dataset preprocessing](#dataset-preprocessing)
-      - [Spark CPU Dataset preprocessing](#spark-cpu-dataset-preprocessing)
      - [NVTabular GPU preprocessing](#nvtabular-gpu-preprocessing)
  * [Training process](#training-process)
  * [Evaluation process](#evaluation-process)
 - [Performance](#performance)
  * [Benchmarking](#benchmarking)
-    + [NVTabular and Spark CPU Preprocessing comparison](#nvtabular-and-spark-cpu-preprocessing-comparison)
-    + [Training and inference performance benchmark](#training-and-inference-performance-benchmark)
  * [Results](#results)
    + [Training accuracy results](#training-accuracy-results)
      - [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
-      - [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
+      - [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
      - [Training accuracy plots](#training-accuracy-plots)
      - [Training stability test](#training-stability-test)
      - [Impact of mixed precision on training accuracy](#impact-of-mixed-precision-on-training-accuracy)
    + [Training performance results](#training-performance-results)
      - [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
-      - [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
-    + [Inference performance results](#inference-performance-results)
-      - [Inference performance: NVIDIA DGX A100 (8x A100 80GB)](#inference-performance-nvidia-dgx-a100-8x-a100-80gb)
-      - [Inference performance: NVIDIA DGX-1 (8x V100 16GB)](#inference-performance-nvidia-dgx-1-8x-v100-16gb)
+      - [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+    + [Evaluation performance results](#evaluation-performance-results)
+      - [Evaluation performance: NVIDIA DGX A100 (8x A100 80GB)](#evaluation-performance-nvidia-dgx-a100-8x-a100-80gb)
+      - [Evaluation performance: NVIDIA DGX-1 (8x V100 32GB)](#evaluation-performance-nvidia-dgx-1-8x-v100-32gb)
 - [Release notes](#release-notes)
  * [Changelog](#changelog)
  * [Known issues](#known-issues)
@ -86,21 +83,21 @@ The Outbrain Dataset is preprocessed in order to get features input to the model
 Features:
 - Request Level:
    * 5 scalar numeric features `dtype=float32`
-    * 8 categorical features (all INT32 `dtype`)
-    * 8 trainable embeddings of (dimension, cardinality of categorical variable): (128,300000), (16,4), (128,100000), (64 ,4000), (64,1000), (64,2500), (64,300), (64,2000)
+    * 8 categorical features `dtype=int32`
+    * 8 trainable embeddings of (dimension, cardinality of categorical variable): (128,300000), (19,4), (128,100000), (64,4000), (64,1000), (64,2500), (64,300), (64,2000)
    * 8  trainable embeddings for wide part of size 1 (serving as an embedding from the categorical to scalar space for input to the wide portion of the model)

 - Item Level:
    * 8 scalar numeric features `dtype=float32`
-    * 5 categorical features (all INT32 `dtype`)
-    * 5 trainable embeddings of dimensions (cardinality of categorical variable): 128 (250000), 64 (2500), 64 (4000), 64 (1000),64 (5000)
+    * 5 categorical features `dtype=int32`
+    * 5 trainable embeddings of (dimension, cardinality of categorical variable): (128,250000), (64,2500), (64,4000), (64,1000), (128,5000)
    * 5 trainable embeddings for wide part of size 1 (working as trainable one-hot embeddings)

 Features describe both the user (Request Level features) and Item (Item Level Features).

 - Model:
    * Input dimension is 26 (13 categorical and 13 numerical features)
-    * Total embedding dimension is 976
+    * Total embedding dimension is 1043
    * 5 hidden layers each with size 1024
    * Total number of model parameter is ~90M
    * Output dimension is 1 (`y` is the probability of click given Request-level and Item-level features)
@ -112,7 +109,7 @@ For more information about feature preprocessing, go to [Dataset preprocessing](

 Model accuracy is defined with the [MAP@12](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) metric. This metric follows the way of assessing model accuracy in the original [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). In this repository, the leaked clicked ads are not taken into account since in industrial setup Data Scientists do not have access to leaked information when training the model. For more information about data leak in Kaggle Outbrain Click Prediction challenge, visit this  [blogpost](https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8) by the 19th place finisher in that competition.

-Training and inference script also reports AUC ROC, binary accuracy, and Loss (BCE) values.
+The training and evaluation script also reports Loss (BCE) values.

 ### Feature support matrix

@ -177,7 +174,7 @@ The following section lists the requirements that you need to meet in order to s

 This repository contains Dockerfile which extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 - [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [20.12-tf2-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container
+- [21.09 Merlin Tensorflow Training](https://ngc.nvidia.com/catalog/containers/nvidia:merlin:merlin-tensorflow-training) NGC container

 Supported GPUs:
 - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
@ -213,57 +210,44 @@ The Outbrain dataset can be downloaded from Kaggle (requires Kaggle account). Un
 HOST_OUTBRAIN_PATH=/raid/outbrain
 ```

-4. Preprocess the Outbrain dataset.
-
-4.1. Build the Wide & Deep Preprocessing Container.
+4. Build the Wide & Deep Container.
 ```
 cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
-docker build -f Dockerfile-preproc . -t wd2-prep
+docker build . -t wd2
 ```

-4.2. Start an interactive session in the Wide&Deep Preprocessing Container. Run preprocessing against the original Outbrain dataset to `tf_records`. You can run preprocessing using Spark (CPU) or NVTabular preprocessing (GPU).
+5. Preprocess the Outbrain dataset.
+
+5.1. Start an interactive session in the Wide & Deep Container. Preprocessing converts the original Outbrain dataset to Parquet files and runs on the GPU using NVTabular.
 ```
-nvidia-docker run --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-prep bash
+docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
 ```

-4.3. Start preprocessing.
-You can preprocess the data using either Spark on CPU or NVTabular on GPU. For more information, go to the [Dataset preprocessing](#dataset-preprocessing) section.
-
-4.3.1. CPU Preprocessing (Spark).
-```
-cd /wd && bash scripts/preproc.sh spark 40
-```
-
-4.3.2. GPU Preprocessing (NVTabular).
-```
-cd /wd && bash scripts/preproc.sh nvtabular 40
-```
-
-The result of preprocessing scripts are prebatched TFRecords. The argument to the script is the number of TFRecords files that will be generated by the script (here 40). TFRecord files are generated in `${HOST_OUTBRAIN_PATH}/tfrecords`.
-
-4.4. Training of the model
-4.4.1. Build the Wide&Deep Training Container
-```
-cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
-docker build -f Dockerfile-train . -t wd2-train
-```
-
-4.4.2. Start an interactive session in the Wide&Deep Training Container
-```
-nvidia-docker run --rm -it --privileged --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-train bash
-```
-
-4.4.3. Run training
-For 1 GPU:
+5.2. Start NVTabular GPU preprocessing. For more information, go to the [Dataset preprocessing](#dataset-preprocessing) section.

 ```
-python main.py
+bash scripts/preproc.sh
 ```

-For 1 GPU with Mixed Precision training with XLA:
+The result of the preprocessing script is an NVTabular dataset stored in Parquet files. The files are generated in `${HOST_OUTBRAIN_PATH}/data`.
+
+6. Train the model
+
+6.1. Start an interactive session in the Wide & Deep Container
+```
+docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
+```
+
+6.2. Run training (`${GPU}` is the number of GPUs to use; any number of available GPUs can be chosen)

 ```
-python main.py --xla --amp
+horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py
+```
+
+Training with mixed precision and XLA:
+
+```
+horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --xla --amp
 ```


@ -272,39 +256,22 @@ For complete usage, run:
 python main.py -h
 ```

-For 8 GPUs:
-```
-mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py
-```

-For 8 GPU with Mixed Precision training with XLA:
-```
-mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --xla --amp
-```
-
-
-
-5. Run validation or evaluation.
+7. Run validation or evaluation.
 If you want to run validation or evaluation, you can either:
 * use the checkpoint obtained from the training commands above, or
 * download the pretrained checkpoint from NGC.

 In order to download the checkpoint from NGC, visit [ngc.nvidia.com](https://ngc.nvidia.com) website and browse the available models. Download the checkpoint files and unzip them to some path, for example, to `$HOST_OUTBRAIN_PATH/checkpoints/` (which is the default path for storing the checkpoints during training). The checkpoint requires around 700MB disk space.

-6. Start validation/evaluation.
+8. Start validation/evaluation.
 In order to validate the checkpoint on the evaluation set, run the `main.py` script with the `--evaluate` and `--use_checkpoint` flags.

-For 1 GPU:
 ```
-python main.py --evaluate --use_checkpoint
+horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --evaluate --use_checkpoint
 ```

-For 8 GPUs:
-```
-mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --evaluate --use_checkpoint
-```
-
-Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark yours performance to [Training and inference performance benchmark](#training-and-inference-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training and evaluation performance benchmark](#training-and-evaluation-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.

 ## Advanced

@ -313,29 +280,29 @@ The following sections provide greater details of the dataset, running training,
 ### Scripts and sample code

 These are the important scripts in this repository:
-* `main.py` - Python script for training the Wide & Deep recommender model. This script is run inside the training container (named `wd-train` in the [Quick Start Guide](#quick-start-guide)).
-* `scripts/preproc.sh` - Bash script for Outbrain dataset preparation for training, preprocessing and saving into TFRecords format. This script is run inside a preprocessing container (named `wd-prep` in the [Quick Start Guide](#quick-start-guide)).
-* `data/outbrain/dataloader.py` - Python file containing data loaders for training and evaluation set.
+* `main.py` - Python script for training the Wide & Deep recommender model.
+* `scripts/preproc.sh` - Bash script for Outbrain dataset preparation for training, preprocessing and saving into NVTabular format.
+* `data/outbrain/dataloader.py` - Python file containing NVTabular data loaders for the training and evaluation sets.
 * `data/outbrain/features.py` - Python file describing the request and item level features as well as embedding dimensions and hash buckets’ sizes.
 * `trainer/model/widedeep.py` - Python file with model definition.
-* `trainer/utils/run.py` - Python file with training loop.
+* `trainer/run.py` - Python file with training and evaluation setup.

 ### Parameters

-These are the important parameters in the `main.py` script:
+These are the model parameters in the `main.py` script:

 | Scope| parameter| Comment| Default Value |
 | -------------------- | ----------------------------------------------------- | ------------------------------------------------------------ | ------------- |
-| location of datasets | --transformed_metadata_path TRANSFORMED_METADATA_PATH | Path to transformed_metadata for feature specification reconstruction |               |
-|location of datasets| --use_checkpoint|Use checkpoint stored in model_dir path |False
+|location of datasets |--train_data_pattern TRAIN_DATA_PATTERN |Pattern of training file names |/outbrain/data/train/*.parquet |
+|location of datasets |--eval_data_pattern EVAL_DATA_PATTERN |Pattern of eval file names |/outbrain/data/valid/*.parquet |
+|location of datasets|--use_checkpoint|Use checkpoint stored in model_dir path |False
 |location of datasets|--model_dir MODEL_DIR|Destination where model checkpoint will be saved |/outbrain/checkpoints
 |location of datasets|--results_dir RESULTS_DIR|Directory to store training results | /results
 |location of datasets|--log_filename LOG_FILENAME|Name of the file to store dlloger output |log.json|
-|training parameters|--training_set_size TRAINING_SET_SIZE|Number of samples in the training set | 59761827
 |training parameters|--global_batch_size GLOBAL_BATCH_SIZE|Total size of training batch | 131072
 |training parameters|--eval_batch_size EVAL_BATCH_SIZE|Total size of evaluation batch | 131072
 |training parameters|--num_epochs NUM_EPOCHS|Number of training epochs | 20
-|training parameters|--cpu|Run computations on the CPU | False
+|training parameters|--cpu|Run computations on the CPU | Currently not supported
 |training parameters|--amp|Enable automatic mixed precision conversion | False
 |training parameters|--xla|Enable XLA conversion | False
 |training parameters|--linear_learning_rate LINEAR_LEARNING_RATE|Learning rate for linear model | 0.02
@ -350,8 +317,6 @@ These are the important parameters in the `main.py` script:
 |run mode parameters|--affinity{socket,single,single_unique,<br>socket_unique_interleaved,<br>socket_unique_continuous,disabled}|Type of CPU affinity | socket_unique_interleaved


-
-
 ### Command-line options
 To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option:
 ```
@ -374,38 +339,15 @@ The original data is stored in several separate files:
 * `promoted_content.csv` - metadata about the ads
 * `document_meta.csv`, `document_topics.csv`, `document_entities.csv`, `document_categories.csv` - metadata about the documents

-During the preprocessing stage, the data is transformed into 87M rows tabular data of 26 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. Splitting into train and eval is done in this way so that random 80% of daily events for the first 10 days of the dataset form a training set and remaining part (20% of events daily for the first 10 days and all events in the last two days) form an evaluation set. Eventually the dataset is saved in pre-batched TFRecord format.
+During the preprocessing stage, the data is transformed into tabular data with 87M rows and 26 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. The split is done so that a random 80% of daily events for the first 10 days of the dataset forms the training set and the remaining part (20% of events daily for the first 10 days and all events in the last two days) forms the evaluation set. Finally, the dataset is saved in NVTabular parquet format.

 #### Dataset preprocessing

-Dataset preprocessing aims in creating in total 26 features: 13 categorical and 13 numerical. These features are obtained from the original Outbrain dataset in preprocessing. There are 2 types of preprocessing available for the model:
-Spark CPU preprocessing
-[NVTabular](https://nvidia.github.io/NVTabular/v0.3.0/index.html) GPU preprocessing
-
-Both split the dataset into train and evaluation sets and produce the same feature set, therefore, the training is agnostic to the preprocessing step.
-
-For comparison of Spark CPU and NVTabular preprocessing go to [NVTabular and Spark CPU Preprocessing comparison](#nvtabular-and-spark-cpu-preprocessing-comparison)
-
-##### Spark CPU Dataset preprocessing
-
-The original dataset is preprocessed using the scripts provided in `data/outbrain/spark`. Preprocessing is split into 3 preprocessing steps: `preproc1.py`, `preproc2.py`, and `preproc3.py` that form a complete workflow. The workflow consists of the following operations:
-* separating out the validation set for cross-validation
-* filling missing data with mode, median, or imputed values
-* joining click data, ad metadata,  and document category, topic and entity tables to create an enriched table
-* computing  7 click-through rates (CTR) for ads grouped by 7 features
-* computing attribute cosine similarity between the landing page and ad to be featured on the page
-* math transformations of the numeric features (logarithmic, scaling, binning)
-* categorifying data using hash-bucketing
-* storing the resulting set of features in pre-batched TFRecord format
-
-The `preproc1-3.py` preprocessing scripts use PySpark. In the Docker image, we have installed Spark 2.3.1 as a standalone cluster of Spark. The `preproc1.py` script splits the data into a training set and a validation set. The `preproc2.py` script computes the click-through rates (CTR) and cosine similarities between the features. The `preproc3.py` script performs the math transformations and generates the final TFRecord files. The data in the output files is pre-batched (with the default batch size of 4096) to avoid the overhead of the TFRecord format, which otherwise is not suitable for the tabular data.
-The preprocessing includes some very resource-exhaustive operations including joining tables having over 2 billions of rows. Such operations may not fit into the RAM memory, and therefore we use Spark which is well suited for handling tabular operations on large data with limited RAM. Note that the Spark job requires about 500 GB disk space and 300 GB RAM to perform the preprocessing.
-
-For more information about Spark, refer to the [Spark documentation](https://spark.apache.org/docs/2.3.1/).
+Dataset preprocessing aims to create a total of 26 features: 13 categorical and 13 numerical. These features are obtained from the original Outbrain dataset using [NVTabular](https://nvidia.github.io/NVTabular/v0.6.1/index.html) preprocessing.

 ##### NVTabular GPU preprocessing

-The NVTabular dataset is preprocessed using the script provided in `data/outbrain/nvtabular`. The workflow consists of most of the same operations in the Spark pipeline:
+The NVTabular dataset is preprocessed using the script provided in `data/outbrain/nvtabular`. The workflow consists of:
 * separating out the validation set for cross-validation
 * filling missing data with the mode (most frequent value), median, or imputed values
 * joining the tables for the ad clicks data (click data, ad metadata, and document category, topic and entity tables) to create an enriched table
@ -415,16 +357,14 @@ The NVTabular dataset is preprocessed using the script provided in `data/outbrai
 * categorifying data using hash-bucketing
 * storing the result in a Parquet format

-**Transforming the result into the pre-batched TFRecord format**
-
-Most of the code describing operations in this workflow are in `data/outbrain/nvtabular/utils/workflow.py` and leverage NVTabular v0.3. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library. The code to transform the NVTabular Parquet output into TFRecords is in `data/outbrain/nvtabular/utils/converter.py`.
-The NVTabular version of preprocessing is not subject to the same memory and storage constraints as its Spark counterpart, since NVTabular is able to manipulate tables on GPU and work with tables much larger than even physical RAM memory. The NVTabular Outbrain workflow has been successfully tested on DGX-1 V100 and DGX A100 for single and multigpu preprocessing.
+Most of the code describing operations in this workflow is in `data/outbrain/nvtabular/utils/workflow.py` and leverages NVTabular v0.6.1. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library.
+The NVTabular Outbrain workflow has been successfully tested on DGX-1 V100 and DGX A100 for single and multigpu preprocessing.

 For more information about NVTabular, refer to the [NVTabular documentation](https://github.com/NVIDIA/NVTabular).

 ### Training process

-The training can be started by running the `main.py` script. By default, the script is in train mode. Other training related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`. Training happens on a TFRecords training dataset files that match `--train_data_pattern`.  Training is run for `--num_epochs` epochs with a global batch size of `--global_batch_size` in strong scaling mode (i.e. the effective batch size per GPU equals `global_batch_size/gpu_count`).
+The training can be started by running the `main.py` script. By default, the script is in train mode. Other training related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`. Training uses the NVTabular data loader on the NVTabular training dataset files that match `--train_data_pattern`.  Training is run for `--num_epochs` epochs with a global batch size of `--global_batch_size` in strong scaling mode (i.e. the effective batch size per GPU equals `global_batch_size/gpu_count`).

 The model:
 `tf.keras.experimental.WideDeepModel` consists of a wide part and deep part with a sigmoid activation in the output layer (see [Figure 1](#model-architecture) for reference and `trainer/model/widedeep.py` for model definition).
@ -435,89 +375,46 @@ Two separate optimizers are used to optimize the wide and the deep part of the n
 * RMSProp optimizer is used to optimize the deep part of the network.

 Checkpoint of the model:
-* Can be loaded at the beginning of training when `--use_checkpoint` is set
+* Can be loaded at the beginning of training when `--use_checkpoint` is set.
 * is saved into `--model_dir` after each training epoch. Only the last checkpoint is kept.
-* Contains information about number of training epochs
+* Contains information about number of training epochs.

 The model is evaluated on an evaluation dataset after every training epoch. The training log is displayed in the console and stored in  `--log_filename`.

-Every 100 batches with training metrics:
-loss, binary accuracy, AUC ROC, MAP@12 value
+Every 100 batches, training metrics are logged: bce loss

-Every epoch after training, evaluation metrics are logged:
-loss, binary accuracy, AUC ROC, MAP@12 value
+Every epoch after training, evaluation metrics are logged: bce loss and MAP@12 value

 ### Evaluation process

-The evaluation can be started by running the `main.py --evaluation` script. Evaluation is done for TFRecords dataset stored in `--eval_data_pattern`. Other evaluation related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`.
+The evaluation can be started by running the `main.py --evaluation` script. Evaluation is done on NVTabular parquet dataset stored in `--eval_data_pattern`. Other evaluation related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`.

 During evaluation (`--evaluation` flag):
-* Model is restored from checkpoint in `--model_dir` if `--use_checkpoint` is set
-* Evaluation log is displayed in console and stored in  `--log_filename`
-* Every 100 batches evaluation metrics are logged - loss, binary accuracy, AUC ROC, MAP@12 value
+* Model is restored from checkpoint in `--model_dir` if `--use_checkpoint` is set.
+* Evaluation log is displayed in console and stored in  `--log_filename`.
+* Every 100 batches evaluation metrics are logged: bce loss.

-After the whole evaluation, the total evaluation metrics are logged,  loss, binary accuracy, AUC ROC, MAP@12 value.
+After the whole evaluation, the total evaluation metrics are logged: bce loss and MAP@12 value.

 ## Performance

-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
-
 ### Benchmarking

 The following section shows how to run benchmarks measuring the model performance in training mode.

-#### NVTabular and Spark CPU Preprocessing comparison
-
-Two types of dataset preprocessing are presented in Spark-CPU and NVTabular on GPU repositories. Both of these preprocess return prebatched TFRecords files with the same structure. The following table shows the comparison of both preprocessing in terms of code complication (Lines of code), top RAM consumption, and preprocessing time.
-
-| |CPU preprocessing     | CPU Preprocessing        | GPU preprocessing        | GPU Preprocessing        | GPU preprocessing        | GPU Preprocessing        |
-| -------------------------- | ----- | --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------|
-|| Spark on NVIDIA DGX-1 | Spark on NVIDIA DGX A100 | NVTabular on DGX-1 1 GPU | NVTabular on DGX-1 8 GPU | NVTabular DGX A100 1 GPU | NVTabular DGX A100 8 GPU |      |
-| Lines of code*             | ~1500 | ~1500| ~500| ~500| ~500| ~500|
-| Top RAM consumption \[GB\] | 167.0 | 223.4| 34.3| 48.7| 37.7 | 50.6|
-| Top VRAM consumption per GPU \[GB\] | 0 | 0 | 16 | 13 | 45 | 67|
-| Preprocessing time \[min\] |45.6|38.5|4.4|3.9|2.6| 2.3|
-
-
-To achieve the same results for Top RAM consumption and preprocessing time, run a preprocessing container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset).
-```
-nvidia-docker run --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-prep bash
-```
-
-In the preprocessing container, run the preprocessing benchmark.
-
-For Spark CPU preprocessing:
-```
-cd /wd && bash scripts/preproc_benchmark.sh -m spark
-```
-
-For GPU NVTabular preprocessing:
-```
-cd /wd && bash scripts/preproc_benchmark.sh -m nvtabular
-```
-
-
-#### Training and inference performance benchmark
+#### Training and evaluation performance benchmark

 Benchmark script is prepared to measure performance of the model during training (default configuration) and evaluation (`--evaluation`). Benchmark runs training or evaluation for `--benchmark_steps` batches, however measurement of performance starts after `--benchmark_warmup_steps`. Benchmark can be run for single and 8 GPUs and with a combination of XLA (`--xla`), AMP (`--amp`), batch sizes (`--global_batch_size` , `--eval_batch_size`) and affinity (`--affinity`).

 In order to run benchmark follow these steps:
-Run training container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset):
-```
-nvidia-docker run --rm -it --ipc=host --privileged -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-train bash
-```

+Run Wide & Deep Container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset):
+```
+docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
+```
 Run the benchmark script:
-For 1 GPU:
 ```
-python main.py --benchmark
-```
-
-The benchmark will be run for training with default training parameters.
-
-For 8GPUs:
-```
-mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --benchmark
+horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --benchmark
 ```

 ### Results
@ -530,27 +427,27 @@ The following sections provide details on how we achieved our performance and ac

 Our results were obtained by running the `main.py` training script in the TensorFlow2 NGC container on NVIDIA DGX A100 with (8x A100 80GB) GPUs.

-| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12), Spark dataset | Accuracy - mixed precision (MAP@12),Spark Dataset | Accuracy - TF32 (MAP@12), NVTabular dataset | Accuracy - mixed precision (MAP@12), NVTabular Dataset | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
-| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- | ------------------------------ | ----------------------------------------- | ----------------------------------------------- |
-1|131072|Yes|0.65536|0.65537|0.65537|0.65646|16.40|13.71|1.20
-1|131072|No|0.65538|0.65533|0.65533|0.65643|19.58|18.49|1.06
-8|16384|Yes|0.65527|0.65525|0.65525|0.65638|7.77|9.71|0.80
-8|16384|No|0.65517|0.65525|0.65525|0.65638|7.84|9.48|0.83
+| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12) | Accuracy - mixed precision (MAP@12) |  Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
+| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- |  ----------------------------------------------- |
+1|131072|Yes|0.65656|0.65654|13.40|9.48|1.41
+1|131072|No |0.65662|0.65656|17.75|13.38|1.33
+8|16384|Yes |0.65672|0.65665|4.82|4.50|1.07
+8|16384|No  |0.65671|0.65655|5.71|5.72|1.00


 To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).

-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
+##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)

-Our results were obtained by running the main.py training script in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
+Our results were obtained by running the main.py training script in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.


-| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12), Spark dataset | Accuracy - mixed precision (MAP@12),Spark Dataset | Accuracy - TF32 (MAP@12), NVTabular dataset | Accuracy - mixed precision (MAP@12), NVTabular Dataset | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
-| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- | ------------------------------ | ----------------------------------------- | ----------------------------------------------- |
-1|131072|Yes|0.65531|0.65529|0.65529|0.65651|66.01|23.66|2.79
-1|131072|No|0.65542|0.65534|0.65534|0.65641|72.68|29.18|2.49|
-8|16384|Yes|0.65544|0.65547|0.65547|0.65642|16.28|13.90|1.17|
-8|16384|No|0.65548|0.65540|0.65540|0.65633|16.34|12.65|1.29|                                  |
+| GPUs | Batch size / GPU | XLA | Accuracy - FP32 (MAP@12) | Accuracy - mixed precision (MAP@12) |  Time to train - FP32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (FP32 to mixed precision) |
+| ---- | ---------------- | --- | --------------|---|------- |  ----------------------------------------- | ----------------------------------------------- |
+1|131072|Yes |0.65658|0.65664|62.89|18.65|3.37
+1|131072|No  |0.65662|0.65658|71.53|25.18|2.84
+8|16384|Yes  |0.65668|0.65655|12.21|8.89|1.37
+8|16384|No   |0.65665|0.65654|14.38|7.17|2.01


 To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@ -559,132 +456,80 @@ To achieve the same results, follow the steps in the [Quick Start Guide](#quick-

 Models trained with FP32, TF32 and Automatic Mixed Precision (AMP), with and without XLA enabled achieve similar accuracy.

-The plot represents MAP@12 in a function of steps (step is single batch) during training for default precision (FP32 for Volta architecture (DGX-1) and TF32 for Ampere GPU architecture (DGX-A100)) and AMP  for XLA and without it for both datasets. All other parameters of training are default.
+The plot represents MAP@12 as a function of steps (a step is a single batch) during training for default precision (FP32 for Volta architecture (DGX-1) and TF32 for Ampere GPU architecture (DGX-A100)) and AMP, with and without XLA, for the NVTabular dataset. All other parameters of training are default.

 <p align="center">
-  <img width="100%" src="./img/leraning_curve_spark.svg" />
+  <img width="100%" src="./img/learning_curve.svg" />
  <br>
-  Figure 2. Learning curves for Spark dataset for different configurations.</a>
+  Figure 2. Learning curves for different configurations on a single GPU.
 </p>

-<p align="center">
-  <img width="100%" src="./img/learning_curve_nvt.svg" />
-  <br>
-  Figure 3. Learning curves for NVTabular dataset for different configurations.</a>
-</p>
-
-
-
-
-
-
 ##### Training stability test

-Training of the model is stable for multiple configurations achieving the standard deviation of 10e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9100 training steps (20 epochs, 455 batches in each epoch, every batch containing 131072), starting from 20 different initial random seeds for each setup. The training was performed in the 20.12-tf1-py3 NGC container on NVIDIA DGX A100 80GB and DGX-1 16GB machines with and without mixed precision enabled, with and without XLA enabled for Spark- and NVTabular generated datasets. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. In columns there is single vs 8 GPU training, in rows DGX A100 and DGX-1 V100.
+Training of the model is stable for multiple configurations achieving the standard deviation of 10e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9140 training steps (20 epochs, 457 batches in each epoch, every batch containing 131072 samples), starting from 20 different initial random seeds for each setup. The training was performed in the 21.09 Merlin TensorFlow Training NGC container on NVIDIA DGX A100 80GB and DGX-1 32GB machines with and without mixed precision enabled, with and without XLA enabled for the NVTabular-generated dataset. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. In columns there is single vs 8 GPU training, in rows DGX A100 and DGX-1 V100.

 <p align="center">
-  <img width="100%" src="./img/training_stability_spark.svg" />
+  <img width="100%" src="./img/training_stability.svg" />
  <br>
-  Figure 4. Training stability for Spark dataset: distribution of MAP@12 across different configurations. 'All configurations' refer to the distribution of MAP@12 for cartesian product of architecture, training precision, XLA usage, single/multi GPU. </a>
+  Figure 3. Training stability plot: distribution of MAP@12 across different configurations. 'All configurations' refers to the distribution of MAP@12 for the Cartesian product of architecture, training precision, XLA usage, and single/multi GPU.
 </p>

-<p align="center">
-  <img width="100%" src="./img/training_stability_nvtabular.svg" />
-  <br>
-  Figure 5. Training stability for NVtabular dataset: distribution of MAP@12 across different configurations. 'All configurations' refer to the distribution of MAP@12 for cartesian product of architecture, training precision, XLA usage, single/multi GPU.</a>
-</p>
-
-

 Training stability was also compared in terms of point statistics for MAP@12 distribution for multiple configurations. Refer to the expandable table below.

 <details>
 <summary>Full tabular data for training stability tests</summary>

-||GPUs|Precicision|Dataset|XLA|mean|std|Min|Max
-|--------|-|---------|-----------|---|----|---|---|---
-DGX A100|1|TF32|Spark preprocessed|Yes|0.65536|0.00016|0.65510|0.65560|
-DGX A100|1|TF32|Spark preprocessed|No|0.65538|0.00013|0.65510|0.65570|
-DGX A100|1|TF32|NVTabular preprocessed|Yes|0.65641|0.00038|0.65530|0.65680|
-DGX A100|1|TF32|NVTabular preprocessed|No|0.65648|0.00024|0.65580|0.65690|
-DGX A100|1|AMP|Spark preprocessed|Yes|0.65537|0.00013|0.65510|0.65550|
-DGX A100|1|AMP|Spark preprocessed|No|0.65533|0.00016|0.65500|0.65550|
-DGX A100|1|AMP|NVTabular preprocessed|Yes|0.65646|0.00036|0.65530|0.65690|
-DGX A100|1|AMP|NVTabular preprocessed|No|0.65643|0.00027|0.65590|0.65690|
-DGX A100|8|TF32|Spark preprocessed|Yes|0.65527|0.00013|0.65500|0.65560|
-DGX A100|8|TF32|Spark preprocessed|No|0.65517|0.00025|0.65460|0.65560|
-DGX A100|8|TF32|NVTabular preprocessed|Yes|0.65631|0.00038|0.65550|0.65690|
-DGX A100|8|TF32|NVTabular preprocessed|No|0.65642|0.00022|0.65570|0.65680|
-DGX A100|8|AMP|Spark preprocessed|Yes|0.65525|0.00018|0.65490|0.65550|
-DGX A100|8|AMP|Spark preprocessed|No|0.65525|0.00016|0.65490|0.65550|
-DGX A100|8|AMP|NVTabular preprocessed|Yes|0.65638|0.00026|0.65580|0.65680|
-DGX A100|8|AMP|NVTabular preprocessed|No|0.65638|0.00031|0.65560|0.65700|
-DGX-1 V100|1|FP32|Spark preprocessed|Yes|0.65531|0.00017|0.65490|0.65560|
-DGX-1 V100|1|FP32|Spark preprocessed|No|0.65542|0.00012|0.65520|0.65560|
-DGX-1 V100|1|FP32|NVTabular preprocessed|Yes|0.65651|0.00019|0.65610|0.65680|
-DGX-1 V100|1|FP32|NVTabular preprocessed|No|0.65638|0.00035|0.65560|0.65680|
-DGX-1 V100|1|AMP|Spark preprocessed|Yes|0.65529|0.00015|0.65500|0.65570|
-DGX-1 V100|1|AMP|Spark preprocessed|No|0.65534|0.00015|0.65500|0.65560|
-DGX-1 V100|1|AMP|NVTabular preprocessed|Yes|0.65651|0.00028|0.65560|0.65690|
-DGX-1 V100|1|AMP|NVTabular preprocessed|No|0.65641|0.00032|0.65570|0.65680|
-DGX-1 V100|8|FP32|Spark preprocessed|Yes|0.65544|0.00019|0.65500|0.65580|
-DGX-1 V100|8|FP32|Spark preprocessed|No|0.65548|0.00013|0.65510|0.65560|
-DGX-1 V100|8|FP32|NVTabular preprocessed|Yes|0.65645|0.00012|0.65630|0.65670|
-DGX-1 V100|8|FP32|NVTabular preprocessed|No|0.65638|0.00015|0.65610|0.65670|
-DGX-1 V100|8|AMP|Spark preprocessed|Yes|0.65547|0.00015|0.65520|0.65580|
-DGX-1 V100|8|AMP|Spark preprocessed|No|0.65540|0.00019|0.65500|0.65580|
-DGX-1 V100|8|AMP|NVTabular preprocessed|Yes|0.65642|0.00028|0.65580|0.65690|
-DGX-1 V100|8|AMP|NVTabular preprocessed|No|0.65633|0.00037|0.65510|0.65680|
+| | GPUs | Precision | XLA | Mean | Std | Min | Max |
+| -------- | --- |  --------- | ---- | ------ | ------ | ------ | ------ |
+|DGX A100|1|TF32|Yes   |0.65656|0.00016|0.6563|0.6569
+|DGX A100|1|TF32|No    |0.65662|0.00013|0.6563|0.6568
+|DGX A100|1|AMP|Yes    |0.65654|0.00010|0.6563|0.6567
+|DGX A100|1|AMP|No     |0.65656|0.00011|0.6564|0.6568
+|DGX A100|8|TF32|Yes   |0.65672|0.00012|0.6565|0.6570
+|DGX A100|8|TF32|No    |0.65671|0.00013|0.6565|0.6569
+|DGX A100|8|AMP|Yes    |0.65665|0.00014|0.6564|0.6569
+|DGX A100|8|AMP|No     |0.65655|0.00012|0.6564|0.6568
+|DGX-1 V100|1|FP32|Yes |0.65658|0.00013|0.6563|0.6568
+|DGX-1 V100|1|FP32|No  |0.65662|0.00011|0.6564|0.6568
+|DGX-1 V100|1|AMP|Yes  |0.65664|0.00011|0.6564|0.6568
+|DGX-1 V100|1|AMP|No   |0.65658|0.00011|0.6564|0.6568
+|DGX-1 V100|8|FP32|Yes |0.65668|0.00016|0.6564|0.6570
+|DGX-1 V100|8|FP32|No  |0.65665|0.00019|0.6564|0.6570
+|DGX-1 V100|8|AMP|Yes  |0.65655|0.00012|0.6563|0.6567
+|DGX-1 V100|8|AMP|No   |0.65654|0.00013|0.6563|0.6567
 </details>


-
-
 ##### Impact of mixed precision on training accuracy

 The accuracy of training, measured with [MAP@12](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) on the evaluation set after the final epoch, was not impacted by enabling mixed precision. The obtained results were statistically similar. The similarity was measured according to the following procedure:

 The model was trained 20 times for default settings (FP32 or TF32 for Volta and Ampere architecture respectively) and 20 times for AMP. After the last epoch, the accuracy score MAP@12 was calculated on the evaluation set.

-Distributions for four configurations: architecture (A100, V100) and single/multi gpu for 2 datasets are presented below.
+Distributions for four configurations: architecture (A100, V100) and single/multi GPU for the NVTabular dataset are presented below.

 <p align="center">
-  <img width="100%" src="./img/amp_influence_spark.svg" />
+  <img width="100%" src="./img/amp_influence.svg" />
  <br>
-  Figure 6. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single and multi gpu training on Spark dataset. </a>
+  Figure 4. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single- and multi-GPU training.
 </p>

-<p align="center">
-  <img width="100%" src="./img/amp_influence_nvtabular.svg" />
-  <br>
-  Figure 7. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single and multi gpu training on NVTabular dataset.
-</p>
-
-
-
 Distribution scores for full precision training and AMP training were compared in terms of mean, variance and [Kolmogorov–Smirnov test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test) to state statistical difference between full precision and AMP results. Refer to the expandable table below.

 <details>
 <summary>Full tabular data for AMP influence on MAP@12</summary>

-|            |GPUs                   | Dataset | XLA    | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
-| ------------ | ---------------------- | ------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------- | ------------------ | ---------------------------------- |
-| DGX A100   | 1    | NVTabular preprocessed | No      | 0.6565 | 0.0002                                                       | 0.6564                                                       | 0.0003              | 0.2000 (0.8320)    |                                    |
-| DGX A100   | 8    | NVTabular preprocessed | No      | 0.6564 | 0.0002                                                       | 0.6564                                                       | 0.0003              | 0.1500 (0.9831)    |                                    |
-| DGX A100   | 1    | Spark preprocessed     | No      | 0.6554 | 0.0001                                                       | 0.6553                                                       | 0.0002              | 0.2500 (0.5713)    |                                    |
-| DGX A100   | 8    | Spark preprocessed     | No      | 0.6552 | 0.0002                                                       | 0.6552                                                       | 0.0002              | 0.3000 (0.3356)    |                                    |
-| DGX A100   | 1    | NVTabular preprocessed | No      | 0.6564 | 0.0004                                                       | 0.6565                                                       | 0.0004              | 0.1500 (0.9831)    |                                    |
-| DGX A100   | 8    | NVTabular preprocessed | No      | 0.6563 | 0.0004                                                       | 0.6564                                                       | 0.0003              | 0.2500 (0.5713)    |                                    |
-| DGX A100   | 1    | Spark preprocessed     | No      | 0.6554 | 0.0002                                                       | 0.6554                                                       | 0.0001              | 0.1500 (0.9831)    |                                    |
-| DGX A100   | 8    | Spark preprocessed     | No      | 0.6553 | 0.0001                                                       | 0.6552                                                       | 0.0002              | 0.1500 (0.9831))   |                                    |
-| DGX-1 V100 | 1    | NVTabular preprocessed | No      | 0.6564 | 0.0004                                                       | 0.6564                                                       | 0.0003              | 0.1000 (1.0000)    |                                    |
-| DGX-1 V100 | 8    | NVTabular preprocessed | No      | 0.6564 | 0.0001                                                       | 0.6563                                                       | 0.0004              | 0.2500 (0.5713)    |                                    |
-| DGX-1 V100 | 1    | Spark preprocessed     | No      | 0.6554 | 0.0001                                                       | 0.6553                                                       | 0.0001              | 0.2000 (0.8320)    |                                    |
-| DGX-1 V100 | 8    | Spark preprocessed     | No      | 0.6555 | 0.0001                                                       | 0.6554                                                       | 0.0002              | 0.3500 (0.1745)    |                                    |
-| DGX-1 V100 | 1    | NVTabular preprocessed | No      | 0.6565 | 0.0002                                                       | 0.6565                                                       | 0.0003              | 0.1500 (0.9831)    |                                    |
-| DGX-1 V100 | 8    | NVTabular preprocessed | No      | 0.6564 | 0.0001                                                       | 0.6564                                                       | 0.0003              | 0.2000 (0.8320)    |                                    |
-| DGX-1 V100 | 1    | Spark preprocessed     | No      | 0.6553 | 0.0002                                                       | 0.6553                                                       | 0.0002              | 0.2000 (0.8320)    |                                    |
-| DGX-1 V100 | 8    | Spark preprocessed     | No      | 0.6554 | 0.0002                                                       | 0.6555                                                       | 0.0002              | 0.1500 (0.9831)    |                                    |
+|              | GPUs                   |  XLA    | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
+| ------------ | ---------------------- |  ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------- | ------------------ | ---------------------------------- |
+| DGX A100   | 1    | Yes     |0.65656|0.00016|0.65654|0.00010|0.10000 (0.99999)
+| DGX A100   | 8    | Yes     |0.65672|0.00012|0.65665|0.00014|0.40000 (0.08106)
+| DGX A100   | 1    | No      |0.65662|0.00013|0.65656|0.00011|0.35000 (0.17453)
+| DGX A100   | 8    | No      |0.65671|0.00013|0.65655|0.00012|0.35000 (0.17453)
+| DGX-1 V100 | 1    | Yes     |0.65658|0.00013|0.65664|0.00011|0.25000 (0.57134)
+| DGX-1 V100 | 8    | Yes     |0.65668|0.00016|0.65655|0.00012|0.30000 (0.33559)
+| DGX-1 V100 | 1    | No      |0.65662|0.00011|0.65658|0.00011|0.20000 (0.83197)
+| DGX-1 V100 | 8    | No      |0.65665|0.00019|0.65654|0.00013|0.40000 (0.08106)

 </details>

@ -696,132 +541,132 @@ Our results were obtained by running the benchmark script (`main.py --benchmark`

 |GPUs | Batch size / GPU | XLA | Throughput - TF32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (TF32 - mixed precision)| Strong scaling - TF32|Strong scaling - mixed precision
 | ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
-|1|131,072|Yes|1642892|1997414|1.22|1.00|1.00|
-|1|131,072|No|1269638|1355523|1.07|1.00|1.00|
-|8|16,384|Yes|3376438|2508278|0.74|2.06|1.26|
-|8|16,384|No|3351118|2643009|0.79|2.64|1.07|
+|1|131,072|Yes|2026524|3069487|1.51|1.00|1.00
+|1|131,072|No |1379960|1928375|1.40|1.00|1.00
+|8|16,384|Yes |6892010|7574174|1.10|3.40|2.47
+|8|16,384|No  |5124054|5120040|1.00|3.71|2.66


-##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
+##### Training performance: NVIDIA DGX-1 (8x V100 32GB)

-Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
+Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with 8x V100 32GB GPUs.

 |GPUs | Batch size / GPU | XLA | Throughput - FP32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (FP32 - mixed precision)| Strong scaling - FP32|Strong scaling - mixed precision
 | ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
-|1|131,072|Yes|361202|1091584|3.02|1.00|1.00
-|1|131,072|No|321816|847229|2.63|1.00|1.00
-|8|16,384|Yes|1512691|1731391|1.14|4.19|1.59
-|8|16,384|No|1490044|1837962|1.23|4.63|2.17
+|1|131,072|Yes|378918|1405633|3.71|1.00|1.00
+|1|131,072|No |323817|969824|2.99|1.00|1.00
+|8|16,384|Yes |2196648|4332939|1.97|5.80|3.08
+|8|16,384|No  |1772485|3058944|1.73|5.47|3.15


-#### Inference performance results
+#### Evaluation performance results

-##### Inference performance: NVIDIA DGX A100 (8x A100 80GB)
+##### Evaluation performance: NVIDIA DGX A100 (8x A100 80GB)

 Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX A100 with 8x A100 80GB GPUs. 


-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
+|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
 |----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|NO|648058|614053|0.95|
-|1|8192|NO|1063986|1063203|1.00|
-|1|16384|NO|1506679|1573248|1.04|
-|1|32768|NO|1983238|2088212|1.05|
-|1|65536|NO|2280630|2523812|1.11|
-|1|131072|NO|2568911|2915340|1.13|
-|8|4096|NO|4516588|4374181|0.97|
-|8|8192|NO|7715609|7718173|1.00|
-|8|16384|NO|11296845|11624159|1.03|
-|8|32768|NO|14957242|15904745|1.06|
-|8|65536|NO|17671055|19332987|1.09|
-|8|131072|NO|19779711|21761656|1.10|
+|1|4096|NO    |1107650|1028782|0.93|
+|1|8192|NO    |1783848|1856528|1.04|
+|1|16384|NO   |2295874|2409601|1.05|
+|1|32768|NO   |2367142|2583293|1.09|
+|1|65536|NO   |3044662|3471619|1.14|
+|1|131072|NO  |3229625|3823612|1.18|
+|8|4096|NO    |5503985|5333228|0.97|
+|8|8192|NO    |12251675|12386870|1.01|
+|8|16384|NO   |16020973|16438269|1.03|
+|8|32768|NO   |17225168|18667798|1.08|
+|8|65536|NO   |19969248|22270424|1.12|
+|8|131072|NO  |19929457|22496045|1.13|

 For more results go to the expandable table below.

 <details>
-<summary>Full tabular data for inference performance results for DGX A100</summary>
+<summary>Full tabular data for evaluation performance results for DGX A100</summary>

-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
+|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
 |----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|YES|621024|648441|1.04|
-|1|4096|NO|648058|614053|0.95|
-|1|8192|YES|1068943|1045790|0.98|
-|1|8192|NO|1063986|1063203|1.00|
-|1|16384|YES|1554101|1710186|1.10|
-|1|16384|NO|1506679|1573248|1.04|
-|1|32768|YES|2014216|2363490|1.17|
-|1|32768|NO|1983238|2088212|1.05|
-|1|65536|YES|2010050|2450872|1.22|
-|1|65536|NO|2280630|2523812|1.11|
-|1|131072|YES|2321543|2885393|1.24|
-|1|131072|NO|2568911|2915340|1.13|
-|8|4096|YES|4328154|4445315|1.03|
-|8|4096|NO|4516588|4374181|0.97|
-|8|8192|YES|7410554|7640191|1.03|
-|8|8192|NO|7715609|7718173|1.00|
-|8|16384|YES|11412928|12422567|1.09|
-|8|16384|NO|11296845|11624159|1.03|
-|8|32768|YES|11428369|12525670|1.10|
-|8|32768|NO|14957242|15904745|1.06|
-|8|65536|YES|13453756|15308455|1.14|
-|8|65536|NO|17671055|19332987|1.09|
-|8|131072|YES|17047482|20930042|1.23|
-|8|131072|NO|19779711|21761656|1.10|
+|1|4096|YES   |1344225|1501677|1.12|
+|1|4096|NO    |1107650|1028782|0.93|
+|1|8192|YES   |2220721|2545781|1.15|
+|1|8192|NO    |1783848|1856528|1.04|
+|1|16384|YES  |2730441|3230949|1.18|
+|1|16384|NO   |2295874|2409601|1.05|
+|1|32768|YES  |2527368|2974417|1.18|
+|1|32768|NO   |2367142|2583293|1.09|
+|1|65536|YES  |3163906|3935731|1.24|
+|1|65536|NO   |3044662|3471619|1.14|
+|1|131072|YES |3171670|4064426|1.28|
+|1|131072|NO  |3229625|3823612|1.18|
+|8|4096|YES   |6243348|6553485|1.05|
+|8|4096|NO    |5503985|5333228|0.97|
+|8|8192|YES   |14995914|16222429|1.08|
+|8|8192|NO    |12251675|12386870|1.01|
+|8|16384|YES  |14584474|16543902|1.13|
+|8|16384|NO   |16020973|16438269|1.03|
+|8|32768|YES  |17840220|21537660|1.21|
+|8|32768|NO   |17225168|18667798|1.08|
+|8|65536|YES  |20732672|24082577|1.16|
+|8|65536|NO   |19969248|22270424|1.12|
+|8|131072|YES |20104010|24157900|1.20|
+|8|131072|NO  |19929457|22496045|1.13|
 </details>


-##### Inference performance: NVIDIA DGX-1 (8x V100 16GB)
+##### Evaluation performance: NVIDIA DGX-1 (8x V100 32GB)

-Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
+Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with 8x V100 32GB GPUs.

-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
+|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
 |----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|NO|375928|439395|1.17|
-|1|8192|NO|526780|754517|1.43|
-|1|16384|NO|673971|1133696|1.68|
-|1|32768|NO|791637|1470221|1.86|
-|1|65536|NO|842831|1753500|2.08|
-|1|131072|NO|892941|1990898|2.23|
-|8|4096|NO|2893390|3278473|1.13|
-|8|8192|NO|3881996|5337866|1.38|
-|8|16384|NO|5003135|8086178|1.62|
-|8|32768|NO|6124648|11087247|1.81|
-|8|65536|NO|6631887|13233484|2.00|
-|8|131072|NO|7030438|15081861|2.15|
+|1|4096|NO    |499442|718163|1.44|
+|1|8192|NO    |670906|1144640|1.71|
+|1|16384|NO   |802366|1599006|1.99|
+|1|32768|NO   |856130|1795285|2.10|
+|1|65536|NO   |934394|2221221|2.38|
+|1|131072|NO  |965293|2403829|2.49|
+|8|4096|NO    |2840155|3602516|1.27|
+|8|8192|NO    |4810100|7912019|1.64|
+|8|16384|NO   |5939908|10876135|1.83|
+|8|32768|NO   |6489446|12593087|1.94|
+|8|65536|NO   |6614453|14742844|2.23|
+|8|131072|NO  |7133219|15524549|2.18|



 For more results go to the expandable table below.

 <details>
-<summary>Full tabular data for inference performance for DGX-1 V100 results</summary>
+<summary>Full tabular data for evaluation performance results for DGX-1 V100</summary>

-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
+|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
 |----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|YES|356963|459481|1.29|
-|1|4096|NO|375928|439395|1.17|
-|1|8192|YES|517016|734515|1.42|
-|1|8192|NO|526780|754517|1.43|
-|1|16384|YES|660772|1150292|1.74|
-|1|16384|NO|673971|1133696|1.68|
-|1|32768|YES|776357|1541699|1.99|
-|1|32768|NO|791637|1470221|1.86|
-|1|65536|YES|863311|1962275|2.27|
-|1|65536|NO|842831|1753500|2.08|
-|1|131072|YES|928290|2235968|2.41|
-|1|131072|NO|892941|1990898|2.23|
-|8|4096|YES|2680961|3182591|1.19|
-|8|4096|NO|2893390|3278473|1.13|
-|8|8192|YES|3738172|5185972|1.39|
-|8|8192|NO|3881996|5337866|1.38|
-|8|16384|YES|4961435|8170489|1.65|
-|8|16384|NO|5003135|8086178|1.62|
-|8|32768|YES|6218767|11658218|1.87|
-|8|32768|NO|6124648|11087247|1.81|
-|8|65536|YES|6808677|14921211|2.19|
-|8|65536|NO|6631887|13233484|2.00|
-|8|131072|YES|7205370|16923294|2.35|
-|8|131072|NO|7030438|15081861|2.15|
+|1|4096|YES   |573285|919150|1.60|
+|1|4096|NO    |499442|718163|1.44|
+|1|8192|YES   |753993|1486867|1.97|
+|1|8192|NO    |670906|1144640|1.71|
+|1|16384|YES  |859699|1945700|2.26|
+|1|16384|NO   |802366|1599006|1.99|
+|1|32768|YES  |904255|1995194|2.21|
+|1|32768|NO   |856130|1795285|2.10|
+|1|65536|YES  |982448|2608010|2.65|
+|1|65536|NO   |934394|2221221|2.38|
+|1|131072|YES |926734|2621095|2.83|
+|1|131072|NO  |965293|2403829|2.49|
+|8|4096|YES   |3102948|4083015|1.32|
+|8|4096|NO    |2840155|3602516|1.27|
+|8|8192|YES   |5536556|10094905|1.82|
+|8|8192|NO    |4810100|7912019|1.64|
+|8|16384|YES  |5722386|10524548|1.84|
+|8|16384|NO   |5939908|10876135|1.83|
+|8|32768|YES  |6813318|14356608|2.11|
+|8|32768|NO   |6489446|12593087|1.94|
+|8|65536|YES  |6918413|16227668|2.35|
+|8|65536|NO   |6614453|14742844|2.23|
+|8|131072|YES |6910518|16423342|2.38|
+|8|131072|NO  |7133219|15524549|2.18|
 </details>

 ## Release notes
@ -829,7 +674,15 @@ For more results go to the expandable table below.
 ### Changelog

 February 2021
-Initial release
+- Initial release
+
+November 2021
+- Refresh release with performance optimizations
+- Updated NVTabular to v0.6.1
+- Replaced native TF dataloader with NVTabular counterpart
+- Removed Spark CPU preprocessing
+- Updated README numbers
+- Changed V100 cards from 16GB to 32GB

 ### Known issues
 * In this model the TF32 precision can in some cases be as fast as the FP16 precision on Ampere GPUs. This is because TF32 also uses Tensor Cores and doesn't need any additional logic such as maintaining FP32 master weights and casts. However, please note that W&D is, by modern recommender standards, a very small model. Larger models should still see significant benefits of using FP16 math.
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/dataloader.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/dataloader.py
@ -12,128 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from functools import partial
-from multiprocessing import cpu_count

+import cupy
+import horovod.tensorflow as hvd
 import tensorflow as tf
+from data.outbrain.features import CATEGORICAL_COLUMNS, NUMERIC_COLUMNS
+from nvtabular.loader.tensorflow import KerasSequenceLoader

-from data.outbrain.features import get_features_keys
+cupy.random.seed(None)


-def _consolidate_batch(elem):
-    label = elem.pop('label')
-    reshaped_label = tf.reshape(label, [-1, label.shape[-1]])
-    features = get_features_keys()
+def seed_fn():
+    min_int, max_int = tf.int32.limits
+    max_rand = max_int // hvd.size()

-    reshaped_elem = {
-        key: tf.reshape(elem[key], [-1, elem[key].shape[-1]])
-        for key in elem
-        if key in features
-    }
+    # Generate a seed fragment on each worker
+    seed_fragment = cupy.random.randint(0, max_rand).get()

-    return reshaped_elem, reshaped_label
+    # Aggregate seed fragments from all Horovod workers
+    seed_tensor = tf.constant(seed_fragment)
+    reduced_seed = hvd.allreduce(seed_tensor, name="shuffle_seed", op=hvd.mpi_ops.Sum)

-
-def get_parse_function(feature_spec):
-    def _parse_function(example_proto):
-        return tf.io.parse_single_example(example_proto, feature_spec)
-
-    return _parse_function
+    return reduced_seed % max_rand


 def train_input_fn(
-        filepath_pattern,
-        feature_spec,
-        records_batch_size,
-        num_gpus=1,
-        id=0):
-    _parse_function = get_parse_function(feature_spec)
-
-    dataset = tf.data.Dataset.list_files(
-        file_pattern=filepath_pattern
-    )
-
-    dataset = dataset.interleave(
-        lambda x: tf.data.TFRecordDataset(x),
-        cycle_length=cpu_count() // num_gpus,
-        block_length=1
-    )
-
-    dataset = dataset.map(
-        map_func=_parse_function,
-        num_parallel_calls=tf.data.experimental.AUTOTUNE
-    )
-
-    dataset = dataset.shard(num_gpus, id)
-
-    dataset = dataset.shuffle(records_batch_size * 8)
-
-    dataset = dataset.repeat(
-        count=None
-    )
-
-    dataset = dataset.batch(
+    train_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=True
+):
+    train_dataset_tf = KerasSequenceLoader(
+        train_paths,
        batch_size=records_batch_size,
-        drop_remainder=False
+        label_names=["clicked"],
+        cat_names=CATEGORICAL_COLUMNS,
+        cont_names=NUMERIC_COLUMNS,
+        engine="parquet",
+        shuffle=shuffle,
+        buffer_size=buffer_size,
+        parts_per_chunk=parts_per_chunk,
+        global_size=hvd.size(),
+        global_rank=hvd.rank(),
+        seed_fn=seed_fn,
    )

-    dataset = dataset.map(
-        map_func=partial(
-            _consolidate_batch
-        ),
-        num_parallel_calls=tf.data.experimental.AUTOTUNE
-    )
-
-    dataset = dataset.prefetch(
-        buffer_size=tf.data.experimental.AUTOTUNE
-    )
-
-    return dataset
+    return train_dataset_tf


 def eval_input_fn(
-        filepath_pattern,
-        feature_spec,
-        records_batch_size,
-        num_gpus=1,
-        repeat=1,
-        id=0):
-    dataset = tf.data.Dataset.list_files(
-        file_pattern=filepath_pattern,
-        shuffle=False
-    )
-
-    dataset = tf.data.TFRecordDataset(
-        filenames=dataset,
-        num_parallel_reads=1
-    )
-
-    dataset = dataset.shard(num_gpus, id)
-
-    dataset = dataset.repeat(
-        count=repeat
-    )
-
-    dataset = dataset.batch(
+    valid_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=False
+):
+    valid_dataset_tf = KerasSequenceLoader(
+        valid_paths,
        batch_size=records_batch_size,
-        drop_remainder=False
+        label_names=["clicked"],
+        cat_names=CATEGORICAL_COLUMNS + ["display_id"],
+        cont_names=NUMERIC_COLUMNS,
+        engine="parquet",
+        shuffle=shuffle,
+        buffer_size=buffer_size,
+        parts_per_chunk=parts_per_chunk,
+        global_size=hvd.size(),
+        global_rank=hvd.rank(),
+        seed_fn=seed_fn,
    )

-    dataset = dataset.apply(
-        transformation_func=tf.data.experimental.parse_example_dataset(
-            features=feature_spec,
-            num_parallel_calls=1
-        )
-    )
-
-    dataset = dataset.map(
-        map_func=partial(
-            _consolidate_batch
-        ),
-        num_parallel_calls=None
-    )
-    dataset = dataset.prefetch(
-        buffer_size=1
-    )
-
-    return dataset
+    return valid_dataset_tf
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/features.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/features.py
@ -16,83 +16,75 @@ import logging

 import tensorflow as tf

-PREBATCH_SIZE = 4096
-DISPLAY_ID_COLUMN = 'display_id'
+DISPLAY_ID_COLUMN = "display_id"

-TIME_COLUMNS = [
-    'doc_event_days_since_published_log_01scaled',
-    'doc_ad_days_since_published_log_01scaled'
+NUMERIC_COLUMNS = [
+    "document_id_document_id_promo_sim_categories",
+    "document_id_document_id_promo_sim_topics",
+    "document_id_document_id_promo_sim_entities",
+    "document_id_promo_ctr",
+    "publisher_id_promo_ctr",
+    "source_id_promo_ctr",
+    "document_id_promo_count",
+    "publish_time_days_since_published",
+    "ad_id_ctr",
+    "advertiser_id_ctr",
+    "campaign_id_ctr",
+    "ad_id_count",
+    "publish_time_promo_days_since_published",
 ]

-GB_COLUMNS = [
-    'pop_document_id',
-    'pop_publisher_id',
-    'pop_source_id',
-    'pop_ad_id',
-    'pop_advertiser_id',
-    'pop_campain_id',
-    'doc_views_log_01scaled',
-    'ad_views_log_01scaled'
-]
-
-SIM_COLUMNS = [
-    'doc_event_doc_ad_sim_categories',
-    'doc_event_doc_ad_sim_topics',
-    'doc_event_doc_ad_sim_entities'
-]
-
-NUMERIC_COLUMNS = TIME_COLUMNS + SIM_COLUMNS + GB_COLUMNS
-
 CATEGORICAL_COLUMNS = [
-    'ad_id',
-    'campaign_id',
-    'doc_event_id',
-    'event_platform',
-    'doc_id',
-    'ad_advertiser',
-    'doc_event_source_id',
-    'doc_event_publisher_id',
-    'doc_ad_source_id',
-    'doc_ad_publisher_id',
-    'event_geo_location',
-    'event_country',
-    'event_country_state',
+    "ad_id",
+    "document_id",
+    "platform",
+    "document_id_promo",
+    "campaign_id",
+    "advertiser_id",
+    "source_id",
+    "geo_location",
+    "geo_location_country",
+    "geo_location_state",
+    "publisher_id",
+    "source_id_promo",
+    "publisher_id_promo",
 ]

 HASH_BUCKET_SIZES = {
-    'doc_event_id': 300000,
-    'ad_id': 250000,
-    'doc_id': 100000,
-    'doc_ad_source_id': 4000,
-    'doc_event_source_id': 4000,
-    'event_geo_location': 2500,
-    'ad_advertiser': 2500,
-    'event_country_state': 2000,
-    'doc_ad_publisher_id': 1000,
-    'doc_event_publisher_id': 1000,
-    'event_country': 300,
-    'event_platform': 4,
-    'campaign_id': 5000
+    "document_id": 300000,
+    "ad_id": 250000,
+    "document_id_promo": 100000,
+    "source_id_promo": 4000,
+    "source_id": 4000,
+    "geo_location": 2500,
+    "advertiser_id": 2500,
+    "geo_location_state": 2000,
+    "publisher_id_promo": 1000,
+    "publisher_id": 1000,
+    "geo_location_country": 300,
+    "platform": 4,
+    "campaign_id": 5000,
 }

 EMBEDDING_DIMENSIONS = {
-    'doc_event_id': 128,
-    'ad_id': 128,
-    'doc_id': 128,
-    'doc_ad_source_id': 64,
-    'doc_event_source_id': 64,
-    'event_geo_location': 64,
-    'ad_advertiser': 64,
-    'event_country_state': 64,
-    'doc_ad_publisher_id': 64,
-    'doc_event_publisher_id': 64,
-    'event_country': 64,
-    'event_platform': 16,
-    'campaign_id': 128
+    "document_id": 128,
+    "ad_id": 128,
+    "document_id_promo": 128,
+    "source_id_promo": 64,
+    "source_id": 64,
+    "geo_location": 64,
+    "advertiser_id": 64,
+    "geo_location_state": 64,
+    "publisher_id_promo": 64,
+    "publisher_id": 64,
+    "geo_location_country": 64,
+    "platform": 19,
+    "campaign_id": 128,
 }

 EMBEDDING_TABLE_SHAPES = {
-    column: (HASH_BUCKET_SIZES[column], EMBEDDING_DIMENSIONS[column]) for column in CATEGORICAL_COLUMNS
+    column: (HASH_BUCKET_SIZES[column], EMBEDDING_DIMENSIONS[column])
+    for column in CATEGORICAL_COLUMNS
 }


@ -101,31 +93,40 @@ def get_features_keys():


 def get_feature_columns():
-    logger = logging.getLogger('tensorflow')
+    logger = logging.getLogger("tensorflow")
    wide_columns, deep_columns = [], []

    for column_name in CATEGORICAL_COLUMNS:
        if column_name in EMBEDDING_TABLE_SHAPES:
            categorical_column = tf.feature_column.categorical_column_with_identity(
-                column_name, num_buckets=EMBEDDING_TABLE_SHAPES[column_name][0])
+                column_name, num_buckets=EMBEDDING_TABLE_SHAPES[column_name][0]
+            )
            wrapped_column = tf.feature_column.embedding_column(
                categorical_column,
                dimension=EMBEDDING_TABLE_SHAPES[column_name][1],
-                combiner='mean')
+                combiner="mean",
+            )
        else:
-            raise ValueError(f'Unexpected categorical column found {column_name}')
+            raise ValueError(f"Unexpected categorical column found {column_name}")

        wide_columns.append(categorical_column)
        deep_columns.append(wrapped_column)

-    numerics = [tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32)
-                for column_name in NUMERIC_COLUMNS]
+    numerics = [
+        tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32)
+        for column_name in NUMERIC_COLUMNS
+        if column_name != DISPLAY_ID_COLUMN
+    ]

    wide_columns.extend(numerics)
    deep_columns.extend(numerics)

-    logger.warning('deep columns: {}'.format(len(deep_columns)))
-    logger.warning('wide columns: {}'.format(len(wide_columns)))
-    logger.warning('wide&deep intersection: {}'.format(len(set(wide_columns).intersection(set(deep_columns)))))
+    logger.warning("deep columns: {}".format(len(deep_columns)))
+    logger.warning("wide columns: {}".format(len(wide_columns)))
+    logger.warning(
+        "wide&deep intersection: {}".format(
+            len(set(wide_columns).intersection(set(deep_columns)))
+        )
+    )

    return wide_columns, deep_columns
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/preproc.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/preproc.py
@ -15,37 +15,28 @@
 import logging
 import os

-os.environ['TF_MEMORY_ALLOCATION'] = "0.0"
-from data.outbrain.nvtabular.utils.converter import nvt_to_tfrecords
+os.environ["TF_MEMORY_ALLOCATION"] = "0.0"
 from data.outbrain.nvtabular.utils.workflow import execute_pipeline
 from data.outbrain.nvtabular.utils.arguments import parse_args
 from data.outbrain.nvtabular.utils.setup import create_config


 def is_empty(path):
-    return not os.path.exists(path) or (not os.path.isfile(path) and not os.listdir(path))
+    return not (os.path.exists(path) and (os.path.isfile(path) or os.listdir(path)))


 def main():
    args = parse_args()
    config = create_config(args)
    if is_empty(args.metadata_path):
-        logging.warning('Creating new stats data into {}'.format(config['stats_file']))
+        logging.warning(
+            "Creating parquets into {}".format(config["output_bucket_folder"])
+        )
        execute_pipeline(config)
    else:
-        logging.warning('Directory is not empty {args.metadata_path}')
-        logging.warning('Skipping NVTabular preprocessing')
-
-    if os.path.exists(config['output_train_folder']) and os.path.exists(config['output_valid_folder']):
-        if is_empty(config['tfrecords_path']):
-            logging.warning('Executing NVTabular parquets to TFRecords conversion')
-            nvt_to_tfrecords(config)
-        else:
-            logging.warning(f"Directory is not empty {config['tfrecords_path']}")
-            logging.warning('Skipping TFrecords conversion')
-    else:
-        logging.warning(f'Train and validation dataset not found in {args.metadata_path}')
+        logging.warning(f"Directory exists {args.metadata_path}")
+        logging.warning("Skipping NVTabular preprocessing")


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/arguments.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/arguments.py
@ -14,39 +14,32 @@

 import argparse

-DEFAULT_DIR = '/outbrain'
+DEFAULT_DIR = "/outbrain"


 def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
-        '--data_path',
-        help='Path with the data required for NVTabular preprocessing. '
-             'If stats already exists under metadata_path preprocessing phase will be skipped.',
+        "--data_path",
+        help="Path with the data required for NVTabular preprocessing. "
+             "If stats already exists under metadata_path preprocessing phase will be skipped.",
        type=str,
-        default=f'{DEFAULT_DIR}/orig',
-        nargs='+'
+        default=f"{DEFAULT_DIR}/orig",
+        nargs="+",
    )
    parser.add_argument(
-        '--metadata_path',
-        help='Path with preprocessed NVTabular stats',
+        "--metadata_path",
+        help="Path with preprocessed NVTabular stats",
        type=str,
-        default=f'{DEFAULT_DIR}/data',
-        nargs='+'
+        default=f"{DEFAULT_DIR}/data",
+        nargs="+",
    )
    parser.add_argument(
-        '--tfrecords_path',
-        help='Path where converted tfrecords will be stored',
-        type=str,
-        default=f'{DEFAULT_DIR}/tfrecords',
-        nargs='+'
-    )
-    parser.add_argument(
-        '--workers',
-        help='Number of TfRecords files to be created',
-        type=int,
-        default=40
+        '--use_dask',
+        default=False,
+        action='store_true',
+        help='Use multi-gpu preprocessing for nvTabular workflow'
    )

    return parser.parse_args()
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/converter.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/converter.py
@ -1,158 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-from multiprocessing import Process
-
-import pandas as pd
-import tensorflow as tf
-from tensorflow_transform.tf_metadata import dataset_metadata
-from tensorflow_transform.tf_metadata import dataset_schema
-from tensorflow_transform.tf_metadata import metadata_io
-
-from data.outbrain.features import PREBATCH_SIZE
-from data.outbrain.nvtabular.utils.feature_description import transform_nvt_to_spark, CATEGORICAL_COLUMNS, \
-    DISPLAY_ID_COLUMN, EXCLUDE_COLUMNS
-
-
-def create_metadata(df, prebatch_size, output_path):
-    fixed_shape = [prebatch_size, 1]
-    spec = {}
-    for column in df:
-        if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
-            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64,
-                                                                         default_value=None)
-        else:
-            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32,
-                                                                         default_value=None)
-    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
-    metadata_io.write_metadata(metadata, output_path)
-
-
-def create_tf_example(df, start_index, offset):
-    parsed_features = {}
-    records = df.loc[start_index:start_index + offset - 1]
-    for column in records:
-        if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
-            feature = tf.train.Feature(int64_list=tf.train.Int64List(value=records[column].to_numpy()))
-        else:
-            feature = tf.train.Feature(float_list=tf.train.FloatList(value=records[column].to_numpy()))
-        parsed_features[transform_nvt_to_spark(column)] = feature
-    features = tf.train.Features(feature=parsed_features)
-    return tf.train.Example(features=features)
-
-
-def create_tf_records(df, prebatch_size, output_path):
-    with tf.io.TFRecordWriter(output_path) as out_file:
-        start_index = df.index[0]
-        for index in range(start_index, df.shape[0] + start_index - prebatch_size + 1, prebatch_size):
-            example = create_tf_example(df, index, prebatch_size)
-            out_file.write(example.SerializeToString())
-
-
-def convert(path_to_nvt_dataset, output_path, prebatch_size, exclude_columns, workers=6):
-    train_path = os.path.join(path_to_nvt_dataset, 'train')
-    valid_path = os.path.join(path_to_nvt_dataset, 'valid')
-    output_metadata_path = os.path.join(output_path, 'transformed_metadata')
-    output_train_path = os.path.join(output_path, 'train')
-    output_valid_path = os.path.join(output_path, 'eval')
-
-    for directory in [output_metadata_path, output_train_path, output_valid_path]:
-        os.makedirs(directory, exist_ok=True)
-
-    train_workers, valid_workers = [], []
-    output_train_paths, output_valid_paths = [], []
-
-    for worker in range(workers):
-        part_number = str(worker).rjust(5, '0')
-        record_train_path = os.path.join(output_train_path, f'part-r-{part_number}')
-        record_valid_path = os.path.join(output_valid_path, f'part-r-{part_number}')
-        output_train_paths.append(record_train_path)
-        output_valid_paths.append(record_valid_path)
-
-    logging.warning(f'Prebatch size set to {prebatch_size}')
-    logging.warning(f'Number of TFRecords set to {workers}')
-
-    logging.warning(f'Reading training parquets from {train_path}')
-    df_train = pd.read_parquet(train_path, engine='pyarrow')
-    logging.warning('Done')
-
-    logging.warning(f'Removing training columns {exclude_columns}')
-    df_train = df_train.drop(columns=exclude_columns)
-    logging.warning('Done')
-
-    logging.warning(f'Creating metadata in {output_metadata_path}')
-    metadata_worker = Process(target=create_metadata, args=(df_train, prebatch_size, output_metadata_path))
-    metadata_worker.start()
-
-    logging.warning(f'Creating training TFrecords to {output_train_paths}')
-
-    shape = df_train.shape[0] // workers
-    shape = shape + (prebatch_size - shape % prebatch_size)
-
-    for worker_index in range(workers):
-        df_subset = df_train.loc[worker_index * shape:(worker_index + 1) * shape - 1]
-        worker = Process(target=create_tf_records, args=(df_subset, prebatch_size, output_train_paths[worker_index]))
-        train_workers.append(worker)
-
-    for worker in train_workers:
-        worker.start()
-
-    logging.warning(f'Reading validation parquets from {valid_path}')
-    df_valid = pd.read_parquet(valid_path, engine='pyarrow')
-    logging.warning('Done')
-
-    logging.warning(f'Removing validation columns {exclude_columns}')
-    df_valid = df_valid.drop(columns=exclude_columns)
-    logging.warning('Done')
-
-    logging.warning(f'Creating validation TFrecords to {output_valid_paths}')
-
-    shape = df_valid.shape[0] // workers
-    shape = shape + (prebatch_size - shape % prebatch_size)
-
-    for worker_index in range(workers):
-        df_subset = df_valid.loc[worker_index * shape:(worker_index + 1) * shape - 1]
-        worker = Process(target=create_tf_records, args=(df_subset, prebatch_size, output_valid_paths[worker_index]))
-        valid_workers.append(worker)
-
-    for worker in valid_workers:
-        worker.start()
-
-    for worker_index in range(workers):
-        metadata_worker.join()
-        train_workers[worker_index].join()
-        valid_workers[worker_index].join()
-
-    logging.warning('Done')
-
-    del df_train
-    del df_valid
-
-    return output_path
-
-
-def nvt_to_tfrecords(config):
-    path_to_nvt_dataset = config['output_bucket_folder']
-    output_path = config['tfrecords_path']
-    workers = config['workers']
-
-    convert(
-        path_to_nvt_dataset=path_to_nvt_dataset,
-        output_path=output_path,
-        prebatch_size=PREBATCH_SIZE,
-        exclude_columns=EXCLUDE_COLUMNS,
-        workers=workers
-    )
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/feature_description.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/feature_description.py
@ -12,89 +12,105 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-DISPLAY_ID_COLUMN = 'display_id'
+DISPLAY_ID_COLUMN = "display_id"

-BASE_CONT_COLUMNS = ['publish_time', 'publish_time_promo', 'timestamp', 'document_id_promo_clicked_sum_ctr',
-                     'publisher_id_promo_clicked_sum_ctr',
-                     'source_id_promo_clicked_sum_ctr', 'document_id_promo_count', 'publish_time_days_since_published',
-                     'ad_id_clicked_sum_ctr',
-                     'advertiser_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr', 'ad_id_count',
-                     'publish_time_promo_days_since_published']
+# Continuous input columns; SIM_COLUMNS and DISPLAY_ID_COLUMN are appended to this
+# list below to form CONTINUOUS_COLUMNS.
+# NOTE(review): the CTR entries here are spelled "*_clicked_sum_ctr", while the
+# nvt_to_spark keys in this module are spelled "*_ctr" -- confirm which spelling the
+# consumers of this list expect.
+BASE_CONT_COLUMNS = [
+    "publish_time",
+    "publish_time_promo",
+    "timestamp",
+    "document_id_promo_clicked_sum_ctr",
+    "publisher_id_promo_clicked_sum_ctr",
+    "source_id_promo_clicked_sum_ctr",
+    "document_id_promo_count",
+    "publish_time_days_since_published",
+    "ad_id_clicked_sum_ctr",
+    "advertiser_id_clicked_sum_ctr",
+    "campaign_id_clicked_sum_ctr",
+    "ad_id_count",
+    "publish_time_promo_days_since_published",
+]

 SIM_COLUMNS = [
-    'doc_event_doc_ad_sim_categories',
-    'doc_event_doc_ad_sim_topics',
-    'doc_event_doc_ad_sim_entities'
+    "doc_event_doc_ad_sim_categories",
+    "doc_event_doc_ad_sim_topics",
+    "doc_event_doc_ad_sim_entities",
 ]

 CONTINUOUS_COLUMNS = BASE_CONT_COLUMNS + SIM_COLUMNS + [DISPLAY_ID_COLUMN]

-groupby_columns = ['ad_id_count', 'ad_id_clicked_sum', 'source_id_promo_count', 'source_id_promo_clicked_sum',
-                   'document_id_promo_count', 'document_id_promo_clicked_sum',
-                   'publisher_id_promo_count', 'publisher_id_promo_clicked_sum', 'advertiser_id_count',
-                   'advertiser_id_clicked_sum',
-                   'campaign_id_count', 'campaign_id_clicked_sum']
-
-ctr_columns = ['advertiser_id_clicked_sum_ctr', 'document_id_promo_clicked_sum_ctr',
-               'publisher_id_promo_clicked_sum_ctr',
-               'source_id_promo_clicked_sum_ctr',
-               'ad_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr']
-
-exclude_conts = ['publish_time', 'publish_time_promo', 'timestamp']
+exclude_conts = ["publish_time", "publish_time_promo", "timestamp"]

 NUMERIC_COLUMNS = [col for col in CONTINUOUS_COLUMNS if col not in exclude_conts]

-CATEGORICAL_COLUMNS = ['ad_id', 'document_id', 'platform', 'document_id_promo', 'campaign_id', 'advertiser_id',
-                       'source_id',
-                       'publisher_id', 'source_id_promo', 'publisher_id_promo', 'geo_location', 'geo_location_country',
-                       'geo_location_state']
+# Categorical id columns. The geo_location* columns are not listed here; the workflow
+# module builds them separately and adds them to this group before hashing.
+CATEGORICAL_COLUMNS = [
+    "ad_id",
+    "document_id",
+    "platform",
+    "document_id_promo",
+    "campaign_id",
+    "advertiser_id",
+    "source_id",
+    "publisher_id",
+    "source_id_promo",
+    "publisher_id_promo",
+]
+
+# Columns the workflow module groups on (via JoinGroupby over "clicked") to obtain
+# per-value click sums and counts, from which the click-through-rate features are derived.
+CTR_INPUTS = [
+    "ad_id",
+    "source_id_promo",
+    "document_id_promo",
+    "publisher_id_promo",
+    "advertiser_id",
+    "campaign_id",
+]

 EXCLUDE_COLUMNS = [
-    'publish_time',
-    'publish_time_promo',
-    'timestamp',
-    'ad_id_clicked_sum',
-    'source_id_promo_count',
-    'source_id_promo_clicked_sum',
-    'document_id_promo_clicked_sum',
-    'publisher_id_promo_count', 'publisher_id_promo_clicked_sum',
-    'advertiser_id_count',
-    'advertiser_id_clicked_sum',
-    'campaign_id_count',
-    'campaign_id_clicked_sum',
-    'uuid',
-    'day_event'
+    "publish_time",
+    "publish_time_promo",
+    "timestamp",
+    "ad_id_clicked_sum",
+    "source_id_promo_count",
+    "source_id_promo_clicked_sum",
+    "document_id_promo_clicked_sum",
+    "publisher_id_promo_count",
+    "publisher_id_promo_clicked_sum",
+    "advertiser_id_count",
+    "advertiser_id_clicked_sum",
+    "campaign_id_count",
+    "campaign_id_clicked_sum",
+    "uuid",
+    "day_event",
 ]

 nvt_to_spark = {
-    'ad_id': 'ad_id',
-    'clicked': 'label',
-    'display_id': 'display_id',
-    'document_id': 'doc_event_id',
-    'platform': 'event_platform',
-    'document_id_promo': 'doc_id',
-    'campaign_id': 'campaign_id',
-    'advertiser_id': 'ad_advertiser',
-    'source_id': 'doc_event_source_id',
-    'publisher_id': 'doc_event_publisher_id',
-    'source_id_promo': 'doc_ad_source_id',
-    'publisher_id_promo': 'doc_ad_publisher_id',
-    'geo_location': 'event_geo_location',
-    'geo_location_country': 'event_country',
-    'geo_location_state': 'event_country_state',
-    'document_id_promo_clicked_sum_ctr': 'pop_document_id',
-    'publisher_id_promo_clicked_sum_ctr': 'pop_publisher_id',
-    'source_id_promo_clicked_sum_ctr': 'pop_source_id',
-    'document_id_promo_count': 'doc_views_log_01scaled',
-    'publish_time_days_since_published': 'doc_event_days_since_published_log_01scaled',
-    'ad_id_clicked_sum_ctr': 'pop_ad_id',
-    'advertiser_id_clicked_sum_ctr': 'pop_advertiser_id',
-    'campaign_id_clicked_sum_ctr': 'pop_campain_id',
-    'ad_id_count': 'ad_views_log_01scaled',
-    'publish_time_promo_days_since_published': 'doc_ad_days_since_published_log_01scaled',
-    'doc_event_doc_ad_sim_categories': 'doc_event_doc_ad_sim_categories',
-    'doc_event_doc_ad_sim_topics': 'doc_event_doc_ad_sim_topics',
-    'doc_event_doc_ad_sim_entities': 'doc_event_doc_ad_sim_entities'
+    "ad_id": "ad_id",
+    "clicked": "label",
+    "display_id": "display_id",
+    "document_id": "doc_event_id",
+    "platform": "event_platform",
+    "document_id_promo": "doc_id",
+    "campaign_id": "campaign_id",
+    "advertiser_id": "ad_advertiser",
+    "source_id": "doc_event_source_id",
+    "publisher_id": "doc_event_publisher_id",
+    "source_id_promo": "doc_ad_source_id",
+    "publisher_id_promo": "doc_ad_publisher_id",
+    "geo_location": "event_geo_location",
+    "geo_location_country": "event_country",
+    "geo_location_state": "event_country_state",
+    "document_id_promo_ctr": "pop_document_id",
+    "publisher_id_promo_ctr": "pop_publisher_id",
+    "source_id_promo_ctr": "pop_source_id",
+    "document_id_promo_count": "doc_views_log_01scaled",
+    "publish_time_days_since_published": "doc_event_days_since_published_log_01scaled",
+    "ad_id_ctr": "pop_ad_id",
+    "advertiser_id_ctr": "pop_advertiser_id",
+    "campaign_id_ctr": "pop_campain_id",
+    "ad_id_count": "ad_views_log_01scaled",
+    "publish_time_promo_days_since_published": "doc_ad_days_since_published_log_01scaled",
+    "document_id_document_id_promo_sim_categories": "doc_event_doc_ad_sim_categories",
+    "document_id_document_id_promo_sim_topics": "doc_event_doc_ad_sim_topics",
+    "document_id_document_id_promo_sim_entities": "doc_event_doc_ad_sim_entities",
 }

 spark_to_nvt = {item: key for key, item in nvt_to_spark.items()}
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/setup.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/setup.py
@ -15,34 +15,30 @@
 import os

 from data.outbrain.features import HASH_BUCKET_SIZES
-from data.outbrain.nvtabular.utils.feature_description import transform_spark_to_nvt


 def create_config(args):
+    """Build the preprocessing configuration dictionary from ``args``.
+
+    Reads ``args.data_path``, ``args.metadata_path`` and ``args.use_dask``. The
+    intermediate parquet paths and ``stats_file`` are placed under ``/tmp/preprocessed``;
+    the train/valid output folders are placed under ``args.metadata_path``.
+
+    Returns:
+        dict with the keys listed in ``config`` below.
+    """
-    stats_file = os.path.join(args.metadata_path, 'stats_wnd_workflow')
    data_bucket_folder = args.data_path
    output_bucket_folder = args.metadata_path
-    output_train_folder = os.path.join(output_bucket_folder, 'train/')
-    temporary_folder = os.path.join('/tmp', 'preprocessed')
-    train_path = os.path.join(temporary_folder, 'train_gdf.parquet')
-    valid_path = os.path.join(temporary_folder, 'valid_gdf.parquet')
-    output_valid_folder = os.path.join(output_bucket_folder, 'valid/')
-    tfrecords_path = args.tfrecords_path
-    workers = args.workers
-    hash_spec = {transform_spark_to_nvt(column): hash for column, hash in HASH_BUCKET_SIZES.items()}
+    temporary_folder = os.path.join("/tmp", "preprocessed")
+    train_path = os.path.join(temporary_folder, "train_gdf.parquet")
+    valid_path = os.path.join(temporary_folder, "valid_gdf.parquet")
+    stats_file = os.path.join(temporary_folder, "stats_wnd_workflow")
+    output_train_folder = os.path.join(output_bucket_folder, "train/")
+    output_valid_folder = os.path.join(output_bucket_folder, "valid/")
+    # The hash bucket sizes are used as-is; the old per-column name translation is gone.
+    hash_spec = HASH_BUCKET_SIZES

    config = {
-        'stats_file': stats_file,
-        'data_bucket_folder': data_bucket_folder,
-        'output_bucket_folder': output_bucket_folder,
-        'output_train_folder': output_train_folder,
-        'temporary_folder': temporary_folder,
-        'train_path': train_path,
-        'valid_path': valid_path,
-        'output_valid_folder': output_valid_folder,
-        'tfrecords_path': tfrecords_path,
-        'workers': workers,
-        'hash_spec': hash_spec
+        "stats_file": stats_file,
+        "data_bucket_folder": data_bucket_folder,
+        "output_bucket_folder": output_bucket_folder,
+        "output_train_folder": output_train_folder,
+        "temporary_folder": temporary_folder,
+        "train_path": train_path,
+        "valid_path": valid_path,
+        "output_valid_folder": output_valid_folder,
+        "hash_spec": hash_spec,
+        "dask": args.use_dask
    }

    return config
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py
@ -21,10 +21,24 @@ import nvtabular as nvt
 import rmm
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
-from data.outbrain.nvtabular.utils.feature_description import CATEGORICAL_COLUMNS, CONTINUOUS_COLUMNS, \
-    DISPLAY_ID_COLUMN, groupby_columns, ctr_columns
+from data.outbrain.nvtabular.utils.feature_description import (
+    CATEGORICAL_COLUMNS,
+    DISPLAY_ID_COLUMN,
+    CTR_INPUTS,
+)
+from nvtabular import ColumnGroup
 from nvtabular.io import Shuffle
-from nvtabular.ops import Normalize, FillMedian, FillMissing, LogOp, LambdaOp, JoinGroupby, HashBucket
+from nvtabular.ops import (
+    FillMedian,
+    LogOp,
+    Rename,
+    JoinGroupby,
+    LambdaOp,
+    FillMissing,
+    HashBucket,
+    Normalize,
+)
+from nvtabular.ops import Operator
 from nvtabular.ops.column_similarity import ColumnSimilarity
 from nvtabular.utils import device_mem_size, get_rmm_size

@ -33,24 +47,38 @@ TIMESTAMP_DELTA = 1465876799998

 def get_devices():
+    """Return the GPU indices to use, as a list of ints.
+
+    The list is parsed from the comma-separated ``CUDA_VISIBLE_DEVICES`` environment
+    variable when it is set; otherwise it is ``range(nvmlDeviceGetCount())``.
+    """
    try:
-        devices = [int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
+        devices = [
+            int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        ]
    except KeyError:
        from pynvml import nvmlInit, nvmlDeviceGetCount
+
        nvmlInit()
        devices = list(range(nvmlDeviceGetCount()))
    return devices


-def _calculate_delta(col, gdf):
-    col.loc[col == ''] = None
-    col = col.astype('datetime64[ns]')
-    timestamp = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
-    delta = (timestamp - col).dt.days
-    delta = delta * (delta >= 0) * (delta <= 10 * 365)
-    return delta
+class DaysSincePublished(Operator):
+    """NVTabular operator that adds a ``<column>_days_since_published`` feature per input column.
+
+    The feature is the number of whole days between the event time (``timestamp`` plus
+    ``TIMESTAMP_DELTA``, interpreted as milliseconds) and the datetime held in the input
+    column.
+    """
+
+    def transform(self, columns, gdf):
+        """Add the derived columns to ``gdf`` and return ``gdf``."""
+        for column in columns:
+            col = gdf[column]
+            # Replace blank strings with nulls before the datetime cast.
+            col.loc[col == ""] = None
+            col = col.astype("datetime64[ns]")
+            timestamp = (gdf["timestamp"] + TIMESTAMP_DELTA).astype("datetime64[ms]")
+            delta = (timestamp - col).dt.days
+            # Multiplying by the boolean masks zeroes out negative deltas and deltas
+            # larger than 10 * 365 days.
+            gdf[column + "_days_since_published"] = (
+                    delta * (delta >= 0) * (delta <= 10 * 365)
+            )
+        return gdf
+
+    def output_column_names(self, columns):
+        """Return the names of the columns written by ``transform``."""
+        return [column + "_days_since_published" for column in columns]
+
+    def dependencies(self):
+        """Declare ``timestamp`` as an extra column that ``transform`` reads."""
+        return ["timestamp"]


-def _df_to_coo(df, row='document_id', col=None, data='confidence_level'):
+def _df_to_coo(df, row="document_id", col=None, data="confidence_level"):
+    """Build a ``cupy.sparse.coo_matrix`` from ``df``: values from ``df[data]``, row
+    indices from ``df[row]`` and column indices from ``df[col]`` (``col`` must be given)."""
    return cupy.sparse.coo_matrix((df[data].values, (df[row].values, df[col].values)))


@ -71,7 +99,7 @@ def create_client(devices, local_directory):
            n_workers=len(devices),
            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
            device_memory_limit=device_limit,
-            local_directory=local_directory
+            local_directory=local_directory,
        )
        client = Client(cluster)
        setup_rmm_pool(client, device_pool_size)
@ -79,86 +107,95 @@ def create_client(devices, local_directory):
    return client


-def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec, devices, local_directory):
+def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
+    """Assemble the NVTabular feature-engineering workflow.
+
+    Args:
+        data_bucket_folder: directory containing documents_categories.csv,
+            documents_topics.csv and documents_entities.csv.
+        hash_spec: bucket specification passed to ``HashBucket``.
+        devices: GPU indices handed to ``create_client`` when ``dask`` is truthy.
+        local_directory: directory handed to ``create_client`` when ``dask`` is truthy.
+        dask: when truthy a client is created via ``create_client``; otherwise the
+            workflow is built with ``client=None``.
+
+    Returns:
+        An ``nvt.Workflow`` over the date, CTR, click-statistic, hashed categorical and
+        document-similarity column groups plus ``clicked`` and ``display_id``. The
+        workflow is only constructed here; fitting happens in the caller.
+    """
    rmm.reinitialize(managed_memory=False)
-    documents_categories_path = os.path.join(data_bucket_folder, 'documents_categories.csv')
-    documents_topics_path = os.path.join(data_bucket_folder, 'documents_topics.csv')
-    documents_entities_path = os.path.join(data_bucket_folder, 'documents_entities.csv')
+    documents_categories_path = os.path.join(
+        data_bucket_folder, "documents_categories.csv"
+    )
+    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
+    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
-    documents_entities_cudf['entity_id'] = documents_entities_cudf['entity_id'].astype('category').cat.codes
+    documents_entities_cudf["entity_id"] = (
+        documents_entities_cudf["entity_id"].astype("category").cat.codes
+    )

-    categories = _df_to_coo(documents_categories_cudf, col='category_id')
-    topics = _df_to_coo(documents_topics_cudf, col='topic_id')
-    entities = _df_to_coo(documents_entities_cudf, col='entity_id')
+    categories = _df_to_coo(documents_categories_cudf, col="category_id")
+    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
+    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf
    ctr_thresh = {
-        'ad_id': 5,
-        'source_id_promo': 10,
-        'publisher_id_promo': 10,
-        'advertiser_id': 10,
-        'campaign_id': 10,
-        'document_id_promo': 5,
-
+        "ad_id": 5,
+        "source_id_promo": 10,
+        "publisher_id_promo": 10,
+        "advertiser_id": 10,
+        "campaign_id": 10,
+        "document_id_promo": 5,
    }

-    client = create_client(
-        devices=devices,
-        local_directory=local_directory
+    ctr_inputs = ColumnGroup(CTR_INPUTS)
+    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)
+
+    geo_location = ColumnGroup(["geo_location"])
+    country = (
+            geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
+    )
+    state = (
+            geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
+    )
+    geo_features = geo_location + country + state
+
+    dates = ["publish_time", "publish_time_promo"]
+    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp
+
+    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
+    # CTR = clicked_sum / count, forced to 0 where count is below the per-column
+    # threshold in ctr_thresh. The "*_clicked_sum" columns are the inputs.
+    ctr_cols = (
+            stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
+            >> LambdaOp(
+        f=lambda col, gdf: (
+                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
+        ).where(
+            gdf[col.name.replace("_clicked_sum", "_count")]
+            >= ctr_thresh[col.name.replace("_clicked_sum", "")],
+            0,
+        ),
+        # The lambda reads the matching "<col>_count" column from gdf, so the count
+        # columns (stat_cols minus the "*_clicked_sum" columns) are its dependency.
+        dependency=stat_cols
+                   - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
+    )
+            >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

-    workflow = nvt.Workflow(
-        cat_names=CATEGORICAL_COLUMNS,
-        cont_names=CONTINUOUS_COLUMNS,
-        label_name=['clicked'],
-        client=client
+    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
+    ctr_cols = ctr_cols >> FillMissing()
+
+    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)
+
+    features = (
+            date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]
    )
+    sim_features_categ = (
+            [["document_id", "document_id_promo"]]
+            >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
+            >> Rename(postfix="_categories")
+    )
+    sim_features_topics = (
+            [["document_id", "document_id_promo"]]
+            >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
+            >> Rename(postfix="_topics")
+    )
+    sim_features_entities = (
+            [["document_id", "document_id_promo"]]
+            >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
+            >> Rename(postfix="_entities")
+    )
+    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

-    workflow.add_feature([
-        LambdaOp(
-            op_name='country',
-            f=lambda col, gdf: col.str.slice(0, 2),
-            columns=['geo_location'], replace=False),
-        LambdaOp(
-            op_name='state',
-            f=lambda col, gdf: col.str.slice(0, 5),
-            columns=['geo_location'], replace=False),
-        LambdaOp(
-            op_name='days_since_published',
-            f=_calculate_delta,
-            columns=['publish_time', 'publish_time_promo'], replace=False),
+    client = create_client(devices=devices, local_directory=local_directory) if dask else None

-        FillMedian(columns=['publish_time_days_since_published', 'publish_time_promo_days_since_published']),
-
-        JoinGroupby(columns=['ad_id', 'source_id_promo', 'document_id_promo', 'publisher_id_promo', 'advertiser_id',
-                             'campaign_id'],
-                    cont_names=['clicked'], out_path=output_bucket_folder, stats=['sum', 'count']),
-        LambdaOp(
-            op_name='ctr',
-            f=lambda col, gdf: ((col) / (gdf[col.name.replace('_clicked_sum', '_count')])).where(
-                gdf[col.name.replace('_clicked_sum', '_count')] >= ctr_thresh[col.name.replace('_clicked_sum', '')], 0),
-            columns=['ad_id_clicked_sum', 'source_id_promo_clicked_sum', 'document_id_promo_clicked_sum',
-                     'publisher_id_promo_clicked_sum',
-                     'advertiser_id_clicked_sum', 'campaign_id_clicked_sum'], replace=False),
-        FillMissing(columns=groupby_columns + ctr_columns),
-        LogOp(
-            columns=groupby_columns + ['publish_time_days_since_published', 'publish_time_promo_days_since_published']),
-        Normalize(columns=groupby_columns),
-        ColumnSimilarity('doc_event_doc_ad_sim_categories', 'document_id', categories, 'document_id_promo',
-                         metric='tfidf', on_device=False),
-        ColumnSimilarity('doc_event_doc_ad_sim_topics', 'document_id', topics, 'document_id_promo', metric='tfidf',
-                         on_device=False),
-        ColumnSimilarity('doc_event_doc_ad_sim_entities', 'document_id', entities, 'document_id_promo', metric='tfidf',
-                         on_device=False)
-    ])
-
-    workflow.add_cat_preprocess([
-        HashBucket(hash_spec)
-    ])
-    workflow.finalize()
+    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow

@ -166,31 +203,52 @@ def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec, devices
 def create_parquets(data_bucket_folder, train_path, valid_path):
    cupy.random.seed(seed=0)
    rmm.reinitialize(managed_memory=True)
-    documents_meta_path = os.path.join(data_bucket_folder, 'documents_meta.csv')
-    clicks_train_path = os.path.join(data_bucket_folder, 'clicks_train.csv')
-    events_path = os.path.join(data_bucket_folder, 'events.csv')
-    promoted_content_path = os.path.join(data_bucket_folder, 'promoted_content.csv')
+    documents_meta_path = os.path.join(data_bucket_folder, "documents_meta.csv")
+    clicks_train_path = os.path.join(data_bucket_folder, "clicks_train.csv")
+    events_path = os.path.join(data_bucket_folder, "events.csv")
+    promoted_content_path = os.path.join(data_bucket_folder, "promoted_content.csv")

-    documents_meta = cudf.read_csv(documents_meta_path, na_values=['\\N', ''])
-    documents_meta = documents_meta.dropna(subset='source_id')
-    documents_meta['publisher_id'].fillna(
-        documents_meta['publisher_id'].isnull().cumsum() + documents_meta['publisher_id'].max() + 1, inplace=True)
-    merged = (cudf.read_csv(clicks_train_path, na_values=['\\N', ''])
-              .merge(cudf.read_csv(events_path, na_values=['\\N', '']), on=DISPLAY_ID_COLUMN, how='left',
-                     suffixes=('', '_event'))
-              .merge(cudf.read_csv(promoted_content_path, na_values=['\\N', '']), on='ad_id',
-                     how='left',
-                     suffixes=('', '_promo'))
-              .merge(documents_meta, on='document_id', how='left')
-              .merge(documents_meta, left_on='document_id_promo', right_on='document_id', how='left',
-                     suffixes=('', '_promo')))
-    merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)
-    merged['platform'] = merged['platform'].fillna(1)
-    merged['platform'] = merged['platform'] - 1
-    display_event = merged[[DISPLAY_ID_COLUMN, 'day_event']].drop_duplicates().reset_index()
+    documents_meta = cudf.read_csv(documents_meta_path, na_values=["\\N", ""])
+    documents_meta = documents_meta.dropna(subset="source_id")
+    documents_meta["publisher_id"].fillna(
+        documents_meta["publisher_id"].isnull().cumsum()
+        + documents_meta["publisher_id"].max()
+        + 1,
+        inplace=True,
+    )
+    merged = (
+        cudf.read_csv(clicks_train_path, na_values=["\\N", ""])
+            .merge(
+            cudf.read_csv(events_path, na_values=["\\N", ""]),
+            on=DISPLAY_ID_COLUMN,
+            how="left",
+            suffixes=("", "_event"),
+        )
+            .merge(
+            cudf.read_csv(promoted_content_path, na_values=["\\N", ""]),
+            on="ad_id",
+            how="left",
+            suffixes=("", "_promo"),
+        )
+            .merge(documents_meta, on="document_id", how="left")
+            .merge(
+            documents_meta,
+            left_on="document_id_promo",
+            right_on="document_id",
+            how="left",
+            suffixes=("", "_promo"),
+        )
+    )
+    merged["day_event"] = (merged["timestamp"] / 1000 / 60 / 60 / 24).astype(int)
+    merged["platform"] = merged["platform"].fillna(1)
+    merged["platform"] = merged["platform"] - 1
+    display_event = (
+        merged[[DISPLAY_ID_COLUMN, "day_event"]].drop_duplicates().reset_index()
+    )
    random_state = cudf.Series(cupy.random.uniform(size=len(display_event)))
    valid_ids, train_ids = display_event.scatter_by_map(
-        ((display_event.day_event <= 10) & (random_state > 0.2)).astype(int))
+        ((display_event.day_event <= 10) & (random_state > 0.2)).astype(int)
+    )
    valid_ids = valid_ids[DISPLAY_ID_COLUMN].drop_duplicates()
    train_ids = train_ids[DISPLAY_ID_COLUMN].drop_duplicates()
    valid_set = merged[merged[DISPLAY_ID_COLUMN].isin(valid_ids)]
@ -201,27 +259,39 @@ def create_parquets(data_bucket_folder, train_path, valid_path):
    del merged, train_set, valid_set


-def save_stats(data_bucket_folder, output_bucket_folder,
-               output_train_folder, train_path, output_valid_folder,
-               valid_path, stats_file, hash_spec, local_directory):
+def save_stats(
+        data_bucket_folder,
+        output_train_folder,
+        train_path,
+        output_valid_folder,
+        valid_path,
+        stats_file,
+        hash_spec,
+        local_directory,
+        dask
+):
+    """Fit the workflow on the training data and write the transformed train/valid sets.
+
+    The workflow from ``create_workflow`` is fitted on ``train_path`` only and then
+    applied to both datasets. Results are written as parquet under
+    ``output_train_folder`` and ``output_valid_folder``, and the workflow is saved to
+    ``stats_file``.
+
+    Returns:
+        The fitted ``nvt.Workflow``.
+    """
    devices = get_devices()
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True

-    workflow = create_workflow(data_bucket_folder=data_bucket_folder,
-                               output_bucket_folder=output_bucket_folder,
-                               hash_spec=hash_spec,
-                               devices=devices,
-                               local_directory=local_directory)
+    workflow = create_workflow(
+        data_bucket_folder=data_bucket_folder,
+        hash_spec=hash_spec,
+        devices=devices,
+        local_directory=local_directory,
+        dask=dask
+    )

-    train_dataset = nvt.Dataset(train_path, part_mem_fraction=0.12)
-    valid_dataset = nvt.Dataset(valid_path, part_mem_fraction=0.12)
+    train_dataset = nvt.Dataset(train_path, part_size="1GB")
+    valid_dataset = nvt.Dataset(valid_path, part_size="150MB")
+    workflow.fit(train_dataset)
+    # NOTE(review): the train export uses out_files_per_proc=8 while the valid export
+    # uses output_files=8 -- confirm the different keyword is intentional.
+    workflow.transform(train_dataset).to_parquet(
+        output_path=output_train_folder, shuffle=shuffle, out_files_per_proc=8
+    )
+    workflow.transform(valid_dataset).to_parquet(
+        output_path=output_valid_folder, shuffle=None, output_files=8
+    )

-    workflow.apply(train_dataset, record_stats=True, output_path=output_train_folder, shuffle=shuffle,
-                   out_files_per_proc=5)
-    workflow.apply(valid_dataset, record_stats=False, output_path=output_valid_folder, shuffle=None,
-                   out_files_per_proc=None)
-
-    workflow.save_stats(stats_file)
+    workflow.save(stats_file)

    return workflow

@ -231,24 +301,30 @@ def clean(path):


 def execute_pipeline(config):
+    """Run the preprocessing pipeline described by ``config``.
+
+    Creates the temporary and output folders, calls ``create_parquets`` and then
+    ``save_stats`` with values taken from ``config``, and finally calls ``clean`` on the
+    temporary folder and on ``./categories``.
+    """
-    required_folders = [config['temporary_folder'], config['output_train_folder'], config['output_valid_folder']]
+    required_folders = [
+        config["temporary_folder"],
+        config["output_train_folder"],
+        config["output_valid_folder"],
+    ]
    for folder in required_folders:
        os.makedirs(folder, exist_ok=True)

    create_parquets(
-        data_bucket_folder=config['data_bucket_folder'],
-        train_path=config['train_path'],
-        valid_path=config['valid_path']
+        data_bucket_folder=config["data_bucket_folder"],
+        train_path=config["train_path"],
+        valid_path=config["valid_path"],
    )
    save_stats(
-        data_bucket_folder=config['data_bucket_folder'],
-        output_bucket_folder=config['output_bucket_folder'],
-        output_train_folder=config['output_train_folder'],
-        train_path=config['train_path'],
-        output_valid_folder=config['output_valid_folder'],
-        valid_path=config['valid_path'],
-        stats_file=config['stats_file'],
-        hash_spec=config['hash_spec'],
-        local_directory=config['temporary_folder']
+        data_bucket_folder=config["data_bucket_folder"],
+        output_train_folder=config["output_train_folder"],
+        train_path=config["train_path"],
+        output_valid_folder=config["output_valid_folder"],
+        valid_path=config["valid_path"],
+        stats_file=config["stats_file"],
+        hash_spec=config["hash_spec"],
+        local_directory=config["temporary_folder"],
+        dask=config["dask"]
    )
-    clean(config['temporary_folder'])
+
+    # NOTE(review): create_config places stats_file under temporary_folder, so if
+    # clean() removes the directory the workflow saved by save_stats is removed here
+    # as well -- confirm that is intended.
+    clean(config["temporary_folder"])
+    clean("./categories")
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/ca_states_abbrev_bst.csv
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/ca_states_abbrev_bst.csv
@ -1,13 +0,0 @@
-state_abb,utc_dst_time_offset_cleaned
-AB,-6.0
-BC,-7.0
-MB,-5.0
-NB,-3.0
-NL,-3.0
-NS,-3.0
-NU,-5.0
-ON,-4.0
-PE,-3.0
-QC,-4.0
-SK,-6.0
-YT,-7.0
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/country_codes_utc_dst_tz_delta.csv
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/country_codes_utc_dst_tz_delta.csv
@ -1,247 +0,0 @@
-country_code,utc_dst_time_offset_cleaned
-AX,3.0
-AF,4.5
-AL,2.0
-DZ,1.0
-AD,2.0
-AO,1.0
-AI,-4.0
-AG,-4.0
-AR,-3.0
-AM,4.0
-AW,-4.0
-AU,10.0
-AT,2.0
-AZ,4.0
-BS,-4.0
-BH,3.0
-BD,6.0
-BB,-4.0
-BY,3.0
-BE,2.0
-BZ,-6.0
-BJ,1.0
-BM,-3.0
-BT,6.0
-BO,-4.0
-BA,2.0
-BW,2.0
-BR,-3.0
-IO,6.0
-BN,8.0
-BG,3.0
-BF,0.0
-BI,2.0
-KH,7.0
-CM,1.0
-CA,-5.0
-BQ,-5.0
-KY,-5.0
-CF,1.0
-TD,1.0
-CL,-3.0
-CN,8.0
-CX,7.0
-CC,6.5
-CO,-5.0
-KM,3.0
-CD,1.0
-CG,1.0
-CK,-10.0
-CR,-6.0
-CI,0.0
-HR,2.0
-CW,-4.0
-CY,3.0
-CZ,2.0
-DK,2.0
-DJ,3.0
-DM,-4.0
-DO,-4.0
-TL,9.0
-EC,-5.0
-EG,2.0
-SV,-6.0
-GQ,1.0
-ER,3.0
-EE,3.0
-ET,3.0
-FK,-3.0
-FO,1.0
-FJ,12.0
-FI,3.0
-FR,2.0
-GF,-3.0
-PF,-10.0
-GA,1.0
-GM,0.0
-GE,4.0
-DE,2.0
-GH,0.0
-GI,2.0
-GR,3.0
-GL,-2.0
-GD,-4.0
-GP,-4.0
-GU,10.0
-GT,-6.0
-GG,1.0
-GN,0.0
-GW,0.0
-GY,-4.0
-HT,-5.0
-HN,-6.0
-HK,8.0
-HU,2.0
-IS,0.0
-IN,5.5
-ID,8.0
-IR,4.5
-IQ,3.0
-IE,1.0
-IM,1.0
-IL,3.0
-IT,2.0
-JM,-5.0
-JP,9.0
-JE,1.0
-JO,3.0
-KZ,5.0
-KE,3.0
-KI,13.0
-KP,-4.0
-KR,-4.0
-KP,8.5
-KR,8.5
-KP,9.0
-KR,9.0
-KW,3.0
-KG,6.0
-LA,7.0
-LV,3.0
-LB,3.0
-LS,2.0
-LR,0.0
-LY,2.0
-LI,2.0
-LT,3.0
-LU,2.0
-MO,8.0
-MK,2.0
-MG,3.0
-MW,2.0
-MY,8.0
-MV,5.0
-ML,0.0
-MT,2.0
-MH,12.0
-MQ,-4.0
-MR,0.0
-MU,4.0
-YT,3.0
-MX,-5.0
-FM,10.0
-MD,3.0
-MC,2.0
-MN,9.0
-ME,2.0
-MS,-4.0
-MA,1.0
-MZ,2.0
-MM,6.5
-NA,1.0
-NR,12.0
-NP,5.0
-NL,2.0
-NC,11.0
-NZ,12.0
-NI,-6.0
-NE,1.0
-NG,1.0
-NU,-11.0
-NF,11.0
-MP,10.0
-NO,2.0
-OM,4.0
-PK,5.0
-PW,9.0
-PS,3.0
-PA,-5.0
-PG,10.0
-PY,-4.0
-PE,-5.0
-PH,8.0
-PN,-8.0
-PL,2.0
-PT,1.0
-PR,-4.0
-QA,3.0
-RE,4.0
-RO,3.0
-RU,7.0
-RW,2.0
-BL,-4.0
-AS,-11.0
-WS,-11.0
-AS,13.0
-WS,13.0
-SM,2.0
-ST,0.0
-SA,3.0
-SN,0.0
-RS,2.0
-SC,4.0
-SL,0.0
-SG,8.0
-SK,2.0
-SI,2.0
-SB,11.0
-SO,3.0
-ZA,2.0
-GS,-2.0
-SS,3.0
-ES,2.0
-LK,5.5
-SH,0.0
-KN,-4.0
-SX,-4.0
-MF,-4.0
-SD,3.0
-SR,-3.0
-SJ,2.0
-SZ,2.0
-SE,2.0
-CH,2.0
-SY,3.0
-TW,8.0
-TJ,5.0
-TZ,3.0
-TH,7.0
-TG,0.0
-TK,13.0
-TO,13.0
-TT,-4.0
-TN,1.0
-TR,3.0
-TM,5.0
-TC,-4.0
-TV,12.0
-UG,3.0
-UA,3.0
-AE,4.0
-GB,1.0
-US,-7.0
-UY,-3.0
-UZ,5.0
-VU,11.0
-VA,2.0
-VE,-4.0
-VN,7.0
-VG,-4.0
-VI,-4.0
-VG,-4.0
-VI,-4.0
-WF,12.0
-YE,3.0
-ZM,2.0
-ZW,2.0
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/tensorflow-hadoop-1.5.0.jar
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/tensorflow-hadoop-1.5.0.jar
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/us_states_abbrev_bst.csv
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/data/us_states_abbrev_bst.csv
@ -1,52 +0,0 @@
-state_abb,utc_dst_time_offset_cleaned
-AL,-5.0
-AK,-8.0
-AZ,-7.0
-AR,-5.0
-CA,-7.0
-CO,-6.0
-CT,-4.0
-DE,-4.0
-DC,-4.0
-FL,-4.0
-GA,-4.0
-HI,-10.0
-ID,-6.0
-IL,-5.0
-IN,-4.0
-IA,-5.0
-KS,-5.0
-KY,-4.0
-LA,-5.0
-ME,-4.0
-MD,-4.0
-MA,-4.0
-MI,-4.0
-MN,-5.0
-MS,-5.0
-MO,-5.0
-MT,-6.0
-NE,-5.0
-NV,-7.0
-NH,-4.0
-NJ,-4.0
-NM,-6.0
-NY,-4.0
-NC,-4.0
-ND,-5.0
-OH,-4.0
-OK,-5.0
-OR,-7.0
-PA,-4.0
-RI,-4.0
-SC,-4.0
-SD,-5.0
-TN,-5.0
-TX,-5.0
-UT,-6.0
-VT,-4.0
-VA,-4.0
-WA,-7.0
-WV,-4.0
-WI,-5.0
-WY,-6.0
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc1.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc1.py
@ -1,104 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pyspark.context import SparkContext, SparkConf
-from pyspark.sql.functions import col
-from pyspark.sql.session import SparkSession
-from pyspark.sql.types import IntegerType, StringType, StructType, StructField
-
-OUTPUT_BUCKET_FOLDER = "/tmp/spark/preprocessed/"
-DATA_BUCKET_FOLDER = "/outbrain/orig/"
-SPARK_TEMP_FOLDER = "/tmp/spark/spark-temp/"
-
-conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '40g').set('spark.driver.memory', '200g').set(
-    "spark.local.dir", SPARK_TEMP_FOLDER)
-
-sc = SparkContext(conf=conf)
-spark = SparkSession(sc)
-
-print('Loading data...')
-
-events_schema = StructType(
-    [StructField("display_id", IntegerType(), True),
-     StructField("uuid_event", StringType(), True),
-     StructField("document_id_event", IntegerType(), True),
-     StructField("timestamp_event", IntegerType(), True),
-     StructField("platform_event", IntegerType(), True),
-     StructField("geo_location_event", StringType(), True)]
-)
-
-events_df = spark.read.schema(events_schema) \
-    .options(header='true', inferschema='false', nullValue='\\N') \
-    .csv(DATA_BUCKET_FOLDER + "events.csv") \
-    .withColumn('day_event', (col('timestamp_event') / 1000 / 60 / 60 / 24).cast("int")) \
-    .alias('events')
-
-events_df.count()
-
-print('Drop rows with empty "geo_location"...')
-events_df = events_df.dropna(subset="geo_location_event")
-events_df.count()
-
-print('Drop rows with empty "platform"...')
-events_df = events_df.dropna(subset="platform_event")
-events_df.count()
-
-promoted_content_schema = StructType(
-    [StructField("ad_id", IntegerType(), True),
-     StructField("document_id_promo", IntegerType(), True),
-     StructField("campaign_id", IntegerType(), True),
-     StructField("advertiser_id", IntegerType(), True)]
-)
-
-promoted_content_df = spark.read.schema(promoted_content_schema) \
-    .options(header='true', inferschema='false', nullValue='\\N') \
-    .csv(DATA_BUCKET_FOLDER + "promoted_content.csv") \
-    .alias('promoted_content')
-
-clicks_train_schema = StructType(
-    [StructField("display_id", IntegerType(), True),
-     StructField("ad_id", IntegerType(), True),
-     StructField("clicked", IntegerType(), True)]
-)
-
-clicks_train_df = spark.read.schema(clicks_train_schema) \
-    .options(header='true', inferschema='false', nullValue='\\N') \
-    .csv(DATA_BUCKET_FOLDER + "clicks_train.csv") \
-    .alias('clicks_train')
-
-clicks_train_joined_df = clicks_train_df \
-    .join(promoted_content_df, on='ad_id', how='left') \
-    .join(events_df, on='display_id', how='left')
-clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')
-
-validation_display_ids_df = clicks_train_joined_df.select('display_id', 'day_event') \
-    .distinct() \
-    .sampleBy("day_event", fractions={0: 0.2, 1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2,
-                                      5: 0.2, 6: 0.2, 7: 0.2, 8: 0.2, 9: 0.2, 10: 0.2, 11: 1.0, 12: 1.0}, seed=0)
-validation_display_ids_df.createOrReplaceTempView("validation_display_ids")
-validation_set_df = spark.sql('''SELECT display_id, ad_id, uuid_event, day_event,
-  timestamp_event, document_id_promo, platform_event, geo_location_event
-  FROM clicks_train_joined t
-    WHERE EXISTS (SELECT display_id FROM validation_display_ids
-      WHERE display_id = t.display_id)''')
-
-validation_set_gcs_output = "validation_set.parquet"
-validation_set_df.write.parquet(OUTPUT_BUCKET_FOLDER + validation_set_gcs_output, mode='overwrite')
-
-print(validation_set_df.take(5))
-
-spark.stop()
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc2.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc2.py
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc3.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/preproc3.py
@ -1,474 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import datetime
-
-import numpy as np
-import pandas as pd
-import pyspark.sql.functions as F
-import tensorflow as tf
-from pyspark import TaskContext
-from pyspark.context import SparkContext, SparkConf
-from pyspark.sql.functions import col, udf
-from pyspark.sql.session import SparkSession
-from pyspark.sql.types import ArrayType, DoubleType
-from tensorflow_transform.tf_metadata import dataset_metadata
-from tensorflow_transform.tf_metadata import dataset_schema
-from tensorflow_transform.tf_metadata import metadata_io
-from data.outbrain.features import PREBATCH_SIZE, HASH_BUCKET_SIZES
-from data.outbrain.spark.utils.feature_description import LABEL_COLUMN, DISPLAY_ID_COLUMN, CATEGORICAL_COLUMNS, \
-    DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, \
-    FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM, FLOAT_COLUMNS_NO_TRANSFORM
-
-pd.set_option('display.max_columns', 1000)
-evaluation = True
-evaluation_verbose = False
-OUTPUT_BUCKET_FOLDER = "/tmp/spark/preprocessed/"
-DATA_BUCKET_FOLDER = "/data/orig/"
-SPARK_TEMP_FOLDER = "/tmp/spark/spark-temp/"
-LOCAL_DATA_TFRECORDS_DIR = "/outbrain/tfrecords"
-
-TEST_SET_MODE = False
-
-TENSORFLOW_HADOOP = "data/outbrain/spark/data/tensorflow-hadoop-1.5.0.jar"
-
-conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '40g').set('spark.driver.memory', '200g').set(
-    "spark.local.dir", SPARK_TEMP_FOLDER)
-conf.set("spark.jars", TENSORFLOW_HADOOP)
-conf.set("spark.sql.files.maxPartitionBytes", 805306368)
-
-sc = SparkContext(conf=conf)
-spark = SparkSession(sc)
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument(
-    '--num_train_partitions',
-    help='number of train partitions',
-    type=int,
-    default=40)
-
-parser.add_argument(
-    '--num_valid_partitions',
-    help='number of validation partitions',
-    type=int,
-    default=40)
-args = parser.parse_args()
-num_train_partitions = args.num_train_partitions
-num_valid_partitions = args.num_valid_partitions
-batch_size = PREBATCH_SIZE
-
-# # Feature Vector export
-bool_feature_names = []
-
-int_feature_names = ['ad_views',
-                     'doc_views',
-                     'doc_event_days_since_published',
-                     'doc_ad_days_since_published',
-                     ]
-
-float_feature_names = [
-    'pop_ad_id',
-    'pop_document_id',
-    'pop_publisher_id',
-    'pop_advertiser_id',
-    'pop_campain_id',
-    'pop_source_id',
-    'doc_event_doc_ad_sim_categories',
-    'doc_event_doc_ad_sim_topics',
-    'doc_event_doc_ad_sim_entities',
-]
-
-TRAFFIC_SOURCE_FV = 'traffic_source'
-EVENT_HOUR_FV = 'event_hour'
-EVENT_COUNTRY_FV = 'event_country'
-EVENT_COUNTRY_STATE_FV = 'event_country_state'
-EVENT_GEO_LOCATION_FV = 'event_geo_location'
-EVENT_PLATFORM_FV = 'event_platform'
-AD_ADVERTISER_FV = 'ad_advertiser'
-DOC_AD_SOURCE_ID_FV = 'doc_ad_source_id'
-DOC_AD_PUBLISHER_ID_FV = 'doc_ad_publisher_id'
-DOC_EVENT_SOURCE_ID_FV = 'doc_event_source_id'
-DOC_EVENT_PUBLISHER_ID_FV = 'doc_event_publisher_id'
-DOC_AD_CATEGORY_ID_FV = 'doc_ad_category_id'
-DOC_AD_TOPIC_ID_FV = 'doc_ad_topic_id'
-DOC_AD_ENTITY_ID_FV = 'doc_ad_entity_id'
-DOC_EVENT_CATEGORY_ID_FV = 'doc_event_category_id'
-DOC_EVENT_TOPIC_ID_FV = 'doc_event_topic_id'
-DOC_EVENT_ENTITY_ID_FV = 'doc_event_entity_id'
-
-# ### Configuring feature vector
-category_feature_names_integral = ['ad_advertiser',
-                                   'doc_ad_publisher_id',
-                                   'doc_ad_source_id',
-                                   'doc_event_publisher_id',
-                                   'doc_event_source_id',
-                                   'event_country',
-                                   'event_country_state',
-                                   'event_geo_location',
-                                   'event_hour',
-                                   'event_platform',
-                                   'traffic_source']
-feature_vector_labels_integral = bool_feature_names \
-                                 + int_feature_names \
-                                 + float_feature_names \
-                                 + category_feature_names_integral
-
-train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval'
-
-# ## Exporting integral feature vectors to CSV
-train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + train_feature_vector_gcs_folder_name)
-train_feature_vectors_exported_df.take(3)
-
-integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id'] + feature_vector_labels_integral
-
-CSV_ORDERED_COLUMNS = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'ad_views', 'campaign_id','doc_views',
-                       'doc_event_days_since_published', 'doc_ad_days_since_published',
-                       'pop_ad_id', 'pop_document_id', 'pop_publisher_id', 'pop_advertiser_id', 'pop_campain_id',
-                       'pop_source_id',
-                       'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_topics',
-                       'doc_event_doc_ad_sim_entities', 'ad_advertiser', 'doc_ad_publisher_id',
-                       'doc_ad_source_id', 'doc_event_publisher_id', 'doc_event_source_id', 'event_country',
-                       'event_country_state', 'event_geo_location', 'event_platform',
-                       'traffic_source']
-
-FEAT_CSV_ORDERED_COLUMNS = ['ad_views', 'campaign_id','doc_views',
-                            'doc_event_days_since_published', 'doc_ad_days_since_published',
-                            'pop_ad_id', 'pop_document_id', 'pop_publisher_id', 'pop_advertiser_id', 'pop_campain_id',
-                            'pop_source_id',
-                            'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_topics',
-                            'doc_event_doc_ad_sim_entities', 'ad_advertiser', 'doc_ad_publisher_id',
-                            'doc_ad_source_id', 'doc_event_publisher_id', 'doc_event_source_id', 'event_country',
-                            'event_country_state', 'event_geo_location', 'event_platform',
-                            'traffic_source']
-
-
-def to_array(col):
-    def to_array_(v):
-        return v.toArray().tolist()
-
-    # Important: asNondeterministic requires Spark 2.3 or later
-    # It can be safely removed i.e.
-    # return udf(to_array_, ArrayType(DoubleType()))(col)
-    # but at the cost of decreased performance
-
-    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
-
-
-CONVERT_TO_INT = ['doc_ad_category_id_1',
-                  'doc_ad_category_id_2', 'doc_ad_category_id_3', 'doc_ad_topic_id_1', 'doc_ad_topic_id_2',
-                  'doc_ad_topic_id_3', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
-                  'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6',
-                  'doc_ad_source_id', 'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3',
-                  'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3', 'doc_event_entity_id_1',
-                  'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5',
-                  'doc_event_entity_id_6']
-
-
-def format_number(element, name):
-    if name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
-        return element.cast("int")
-    elif name in CONVERT_TO_INT:
-        return element.cast("int")
-    else:
-        return element
-
-
-def to_array_with_none(col):
-    def to_array_with_none_(v):
-        tmp = np.full((v.size,), fill_value=None, dtype=np.float64)
-        tmp[v.indices] = v.values
-        return tmp.tolist()
-
-    # Important: asNondeterministic requires Spark 2.3 or later
-    # It can be safely removed i.e.
-    # return udf(to_array_, ArrayType(DoubleType()))(col)
-    # but at the cost of decreased performance
-
-    return udf(to_array_with_none_, ArrayType(DoubleType())).asNondeterministic()(col)
-
-
-@udf
-def count_value(x):
-    from collections import Counter
-    tmp = Counter(x).most_common(2)
-    if not tmp or np.isnan(tmp[0][0]):
-        return 0
-    return float(tmp[0][0])
-
-
-def replace_with_most_frequent(most_value):
-    return udf(lambda x: most_value if not x or np.isnan(x) else x)
-
-
-train_feature_vectors_integral_csv_rdd_df = train_feature_vectors_exported_df.select('label', 'display_id', 'ad_id',
-                                                                                     'document_id', 'document_id_event',
-                                                                                     'feature_vector').withColumn(
-    "featvec", to_array("feature_vector")).select(
-    ['label'] + ['display_id'] + ['ad_id'] + ['document_id'] + ['document_id_event'] + [
-        format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index]) for
-        index, element in enumerate([col("featvec")[i] for i in range(len(feature_vector_labels_integral))])]).replace(
-    float('nan'), 0)
-
-test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral'
-
-# ## Exporting integral feature vectors
-test_validation_feature_vectors_exported_df = spark.read.parquet(
-    OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_gcs_folder_name)
-test_validation_feature_vectors_exported_df = test_validation_feature_vectors_exported_df.repartition(40,
-                                                                                                      'display_id').orderBy(
-    'display_id')
-test_validation_feature_vectors_exported_df.take(3)
-
-test_validation_feature_vectors_integral_csv_rdd_df = test_validation_feature_vectors_exported_df.select(
-    'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn("featvec",
-                                                                                                     to_array(
-                                                                                                         "feature_vector")).select(
-    ['label'] + ['display_id'] + ['ad_id'] + ['document_id'] + ['document_id_event'] + [
-        format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index]) for
-        index, element in enumerate([col("featvec")[i] for i in range(len(feature_vector_labels_integral))])]).replace(
-    float('nan'), 0)
-
-
-def make_spec(output_dir, batch_size=None):
-    fixed_shape = [batch_size, 1] if batch_size is not None else []
-    spec = {}
-    spec[LABEL_COLUMN] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
-    spec[DISPLAY_ID_COLUMN] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
-    for name in BOOL_COLUMNS:
-        spec[name] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
-    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM + FLOAT_COLUMNS_NO_TRANSFORM:
-        spec[name] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
-    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
-        spec[name + '_binned'] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
-    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
-        spec[name + '_log_01scaled'] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
-    for name in INT_COLUMNS:
-        spec[name + '_log_01scaled'] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
-    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
-        spec[name] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
-    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
-        shape = fixed_shape[:-1] + [len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])]
-        spec[multi_category] = tf.io.FixedLenFeature(shape=shape, dtype=tf.int64)
-    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
-    metadata_io.write_metadata(metadata, output_dir)
-
-
-# write out tfrecords meta
-make_spec(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata', batch_size=batch_size)
-
-
-def log2_1p(x):
-    return np.log1p(x) / np.log(2.0)
-
-
-# calculate min and max stats for the given dataframes all in one go
-def compute_min_max_logs(df):
-    print(str(datetime.datetime.now()) + '\tComputing min and max')
-    min_logs = {}
-    max_logs = {}
-    float_expr = []
-    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + INT_COLUMNS:
-        float_expr.append(F.min(name))
-        float_expr.append(F.max(name))
-    floatDf = all_df.agg(*float_expr).collect()
-    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
-        minAgg = floatDf[0]["min(" + name + ")"]
-        maxAgg = floatDf[0]["max(" + name + ")"]
-        min_logs[name + '_log_01scaled'] = log2_1p(minAgg * 1000)
-        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg * 1000)
-    for name in INT_COLUMNS:
-        minAgg = floatDf[0]["min(" + name + ")"]
-        maxAgg = floatDf[0]["max(" + name + ")"]
-        min_logs[name + '_log_01scaled'] = log2_1p(minAgg)
-        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg)
-
-    return min_logs, max_logs
-
-
-all_df = test_validation_feature_vectors_integral_csv_rdd_df.union(train_feature_vectors_integral_csv_rdd_df)
-min_logs, max_logs = compute_min_max_logs(all_df)
-
-train_output_string = '/train'
-eval_output_string = '/eval'
-
-path = LOCAL_DATA_TFRECORDS_DIR
-
-
-def create_tf_example_spark(df, min_logs, max_logs):
-    result = {}
-    result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list()))
-    result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list()))
-    for name in FLOAT_COLUMNS:
-        value = df[name].to_list()
-        result[name] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
-    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
-        value = df[name].multiply(10).astype('int64').to_list()
-        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
-        value_prelim = df[name].multiply(1000).apply(np.log1p).multiply(1. / np.log(2.0))
-        value = value_prelim.astype('int64').to_list()
-        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-        nn = name + '_log_01scaled'
-        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
-        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
-    for name in INT_COLUMNS:
-        value_prelim = df[name].apply(np.log1p).multiply(1. / np.log(2.0))
-        value = value_prelim.astype('int64').to_list()
-        result[name + '_log_int'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-        nn = name + '_log_01scaled'
-        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
-        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
-    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
-        value = df[name].fillna(0).astype('int64').to_list()
-        result[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
-        values = []
-        for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
-            values = values + [df[category].to_numpy()]
-        # need to transpose the series so they will be parsed correctly by the FixedLenFeature
-        # we can pass in a single series here; they'll be reshaped to [batch_size, num_values]
-        # when parsed from the TFRecord
-        value = np.stack(values, axis=1).flatten().tolist()
-        result[multi_category] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-    tf_example = tf.train.Example(features=tf.train.Features(feature=result))
-    return tf_example
-
-
-def hash_bucket(num_buckets):
-    return lambda x: x % num_buckets
-
-
-def _transform_to_tfrecords(rdds):
-    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
-    num_rows = len(csv.index)
-    examples = []
-    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
-        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
-            csv_slice = csv.iloc[start_ind:]
-            # drop the remainder
-            print("last Example has: ", len(csv_slice))
-            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), len(csv_slice)))
-            return examples
-        else:
-            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
-        examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), batch_size))
-    return examples
-
-
-max_partition_num = 30
-
-
-def _transform_to_slices(rdds):
-    taskcontext = TaskContext.get()
-    partitionid = taskcontext.partitionId()
-    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
-    for name, size in HASH_BUCKET_SIZES.items():
-        if name in csv.columns.values:
-            csv[name] = csv[name].apply(hash_bucket(size))
-    num_rows = len(csv.index)
-    print("working with partition: ", partitionid, max_partition_num, num_rows)
-    examples = []
-    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
-        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
-            csv_slice = csv.iloc[start_ind:]
-            print("last Example has: ", len(csv_slice), partitionid)
-            examples.append((csv_slice, len(csv_slice)))
-            return examples
-        else:
-            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
-        examples.append((csv_slice, len(csv_slice)))
-    return examples
-
-
-def _transform_to_tfrecords_from_slices(rdds):
-    examples = []
-    for slice in rdds:
-        if len(slice[0]) != batch_size:
-            print("slice size is not correct, dropping: ", len(slice[0]))
-        else:
-            examples.append(
-                (bytearray((create_tf_example_spark(slice[0], min_logs, max_logs)).SerializeToString()), None))
-    return examples
-
-
-def _transform_to_tfrecords_from_reslice(rdds):
-    examples = []
-    all_dataframes = pd.DataFrame([])
-    for slice in rdds:
-        all_dataframes = all_dataframes.append(slice[0])
-    num_rows = len(all_dataframes.index)
-    examples = []
-    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
-        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
-            csv_slice = all_dataframes.iloc[start_ind:]
-            if TEST_SET_MODE:
-                remain_len = batch_size - len(csv_slice)
-                (m, n) = divmod(remain_len, len(csv_slice))
-                print("remainder: ", len(csv_slice), remain_len, m, n)
-                if m:
-                    for i in range(m):
-                        csv_slice = csv_slice.append(csv_slice)
-                csv_slice = csv_slice.append(csv_slice.iloc[:n])
-                print("after fill remainder: ", len(csv_slice))
-                examples.append(
-                    (bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
-                return examples
-            # drop the remainder
-            print("dropping remainder: ", len(csv_slice))
-            return examples
-        else:
-            csv_slice = all_dataframes.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
-            examples.append(
-                (bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
-    return examples
-
-
-TEST_SET_MODE = False
-train_features = train_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(_transform_to_slices)
-cached_train_features = train_features.cache()
-train_full = cached_train_features.filter(lambda x: x[1] == batch_size)
-# split out slies where we don't have a full batch so that we can reslice them so we only drop mininal rows
-train_not_full = cached_train_features.filter(lambda x: x[1] < batch_size)
-train_examples_full = train_full.mapPartitions(_transform_to_tfrecords_from_slices)
-train_left = train_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
-all_train = train_examples_full.union(train_left)
-
-TEST_SET_MODE = True
-valid_features = test_validation_feature_vectors_integral_csv_rdd_df.repartition(num_valid_partitions,
-                                                                                 'display_id').rdd.mapPartitions(
-    _transform_to_slices)
-cached_valid_features = valid_features.cache()
-valid_full = cached_valid_features.filter(lambda x: x[1] == batch_size)
-valid_not_full = cached_valid_features.filter(lambda x: x[1] < batch_size)
-valid_examples_full = valid_full.mapPartitions(_transform_to_tfrecords_from_slices)
-valid_left = valid_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
-all_valid = valid_examples_full.union(valid_left)
-
-all_train.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + train_output_string,
-                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
-                                 keyClass="org.apache.hadoop.io.BytesWritable",
-                                 valueClass="org.apache.hadoop.io.NullWritable")
-
-all_valid.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + eval_output_string,
-                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
-                                 keyClass="org.apache.hadoop.io.BytesWritable",
-                                 valueClass="org.apache.hadoop.io.NullWritable")
-
-spark.stop()
--- a/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/utils/feature_description.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/utils/feature_description.py
@ -1,136 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-LABEL_COLUMN = "label"
-
-DISPLAY_ID_COLUMN = 'display_id'
-
-IS_LEAK_COLUMN = 'is_leak'
-
-DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN = 'display_ad_and_is_leak'
-
-CATEGORICAL_COLUMNS = [
-    'ad_id',
-    'campaign_id',
-    'doc_id',
-    'doc_event_id',
-    'ad_advertiser',
-    'doc_ad_source_id',
-    'doc_ad_publisher_id',
-    'doc_event_publisher_id',
-    'doc_event_source_id',
-    'event_country',
-    'event_country_state',
-    'event_geo_location',
-    'event_platform']
-
-DOC_CATEGORICAL_MULTIVALUED_COLUMNS = {
-}
-
-BOOL_COLUMNS = []
-
-INT_COLUMNS = [
-    'ad_views',
-    'doc_views',
-    'doc_event_days_since_published',
-    'doc_ad_days_since_published']
-
-FLOAT_COLUMNS_LOG_BIN_TRANSFORM = []
-FLOAT_COLUMNS_NO_TRANSFORM = [
-    'pop_ad_id',
-    'pop_document_id',
-    'pop_publisher_id',
-    'pop_advertiser_id',
-    'pop_campain_id',
-    'pop_source_id',
-    'doc_event_doc_ad_sim_categories',
-    'doc_event_doc_ad_sim_topics',
-    'doc_event_doc_ad_sim_entities',
-]
-FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM = []
-FLOAT_COLUMNS = FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM + FLOAT_COLUMNS_NO_TRANSFORM
-
-REQUEST_SINGLE_HOT_COLUMNS = [
-    "doc_event_id",
-    "doc_id",
-    "doc_event_source_id",
-    "event_geo_location",
-    "event_country_state",
-    "doc_event_publisher_id",
-    "event_country",
-    "event_hour",
-    "event_platform",
-    "traffic_source",
-    "event_weekend",
-    "user_has_already_viewed_doc"]
-
-REQUEST_MULTI_HOT_COLUMNS = [
-    "doc_event_entity_id",
-    "doc_event_topic_id",
-    "doc_event_category_id"]
-
-REQUEST_NUMERIC_COLUMNS = [
-    "pop_document_id_conf",
-    "pop_publisher_id_conf",
-    "pop_source_id_conf",
-    "pop_entity_id_conf",
-    "pop_topic_id_conf",
-    "pop_category_id_conf",
-    "pop_document_id",
-    "pop_publisher_id",
-    "pop_source_id",
-    "pop_entity_id",
-    "pop_topic_id",
-    "pop_category_id",
-    "user_views",
-    "doc_views",
-    "doc_event_days_since_published",
-    "doc_event_hour"]
-
-ITEM_SINGLE_HOT_COLUMNS = [
-    "ad_id",
-    'campaign_id',
-    "doc_ad_source_id",
-    "ad_advertiser",
-    "doc_ad_publisher_id"]
-
-ITEM_MULTI_HOT_COLUMNS = [
-    "doc_ad_topic_id",
-    "doc_ad_entity_id",
-    "doc_ad_category_id"]
-
-ITEM_NUMERIC_COLUMNS = [
-    "pop_ad_id_conf",
-    "user_doc_ad_sim_categories_conf",
-    "user_doc_ad_sim_topics_conf",
-    "pop_advertiser_id_conf",
-    "pop_ad_id",
-    "pop_advertiser_id",
-    "pop_campain_id",
-    "user_doc_ad_sim_categories",
-    "user_doc_ad_sim_topics",
-    "user_doc_ad_sim_entities",
-    "doc_event_doc_ad_sim_categories",
-    "doc_event_doc_ad_sim_topics",
-    "doc_event_doc_ad_sim_entities",
-    "ad_views",
-    "doc_ad_days_since_published"]
-
-NV_TRAINING_COLUMNS = (
-        REQUEST_SINGLE_HOT_COLUMNS +
-        REQUEST_MULTI_HOT_COLUMNS +
-        REQUEST_NUMERIC_COLUMNS +
-        ITEM_SINGLE_HOT_COLUMNS +
-        ITEM_MULTI_HOT_COLUMNS +
-        ITEM_NUMERIC_COLUMNS)
--- a/TensorFlow2/Recommendation/WideAndDeep/Dockerfile-train
+++ b/TensorFlow2/Recommendation/WideAndDeep/Dockerfile-train
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -12,16 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.12-tf2-py3

-FROM ${FROM_IMAGE_NAME}
+# Pin this process to one GPU based on its local rank, taken from OpenMPI
+# or alternatively from SLURM. A CUDA_VISIBLE_DEVICES value already set by
+# the caller is left untouched.
+if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then
+    if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then
+        LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}"
+    elif [ -n "${SLURM_LOCALID:-}" ]; then
+        LOCAL_RANK="${SLURM_LOCALID}"
+    fi
+    # Export only when a rank is known: an empty CUDA_VISIBLE_DEVICES would
+    # hide every GPU from the wrapped command.
+    if [ -n "${LOCAL_RANK:-}" ]; then
+        export CUDA_VISIBLE_DEVICES="${LOCAL_RANK}"
+    fi
+fi

-USER root
-
-RUN pip install --no-cache-dir --no-deps tensorflow-transform==0.24.1 tensorflow-metadata==0.14.0 pydot dill && \
-    pip install --no-cache-dir ipdb pynvml==8.0.4 && \
-    pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
-
-WORKDIR  /wd
-
-COPY . .
+exec "$@"
--- a/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence_nvtabular.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence_nvtabular.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence_spark.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/amp_influence_spark.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve_nvt.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve_nvt.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve_spark_nvt.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/learning_curve_spark_nvt.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/leraning_curve_spark.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/leraning_curve_spark.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/map_12_amp_influence.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/map_12_amp_influence.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/training_stability.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/training_stability.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/training_stability_nvtabular.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/training_stability_nvtabular.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/img/training_stability_spark.svg
+++ b/TensorFlow2/Recommendation/WideAndDeep/img/training_stability_spark.svg
--- a/TensorFlow2/Recommendation/WideAndDeep/main.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/main.py
@ -12,8 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+
+os.environ["TF_MEMORY_ALLOCATION"] = "0.6"  # fraction of free memory
+import nvtabular as nvt
+
 from trainer.model.widedeep import wide_deep_model
-from trainer.run import train, evaluate
+from trainer.run import run
 from trainer.utils.arguments import parse_args
 from trainer.utils.setup import create_config

@ -21,13 +26,9 @@ from trainer.utils.setup import create_config
 def main():
+    """Parse the CLI arguments, build the config and model, and start the run."""
     args = parse_args()
     config = create_config(args)
-    model = wide_deep_model(args)
-
-    if args.evaluate:
-        evaluate(args, model, config)
-    else:
-        train(args, model, config)
+    # wide_deep_model returns (model, features); only the model is needed here.
+    model, _ = wide_deep_model(args)
+    run(args, model, config)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/evaluating_benchmark.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/evaluating_benchmark.sh
@ -80,13 +80,13 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
  exit 1
 fi

-cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
+cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
 	python main.py \
 	--evaluate \
 	--benchmark \
 	--benchmark_warmup_steps 500 \
 	--benchmark_steps 1000 \
-	-eval_batch_size ${bs} \
+	--eval_batch_size ${bs} \
 	${amp} \
 	${xla}"

--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/memscript.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/memscript.sh
@ -1,30 +0,0 @@
-#!/bin/bash -e
-
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-mTotal=$(cat /proc/meminfo | grep "MemTotal:" | tr -s ' ' | cut -d' ' -f2)
-mFOffset=$(cat /proc/meminfo | grep "MemAvailable:" | tr -s ' ' | cut -d' ' -f2)
-minFreeMem=$mFOffset
-while true; do 
-	mF=$(cat /proc/meminfo | grep "MemAvailable:" | tr -s ' ' | cut -d' ' -f2)	
-	if [ $minFreeMem -gt $mF ]
-	then
-		minFreeMem=$mF
-		memConsumed=$((mFOffset - mF))
-		echo $memConsumed > mem_consumption.txt
-	fi
-       	sleep 1
-done
--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/preproc.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/preproc.sh
@ -17,7 +17,7 @@
 set -e

 function usage() {
-  echo "Usage: bash scripts/preproc.sh nvtabular/spark [tfrecords]"
+  echo "Usage: bash scripts/preproc.sh"
 }

 if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
@ -26,35 +26,4 @@ if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
  exit 1
 fi

-if [ $# -ne 1 ] && [ $# -ne 2 ]; then
-  usage
-  exit 1
-fi
-
-tfrecords=${2:-40}
-
-if ! [ "$tfrecords" -ge 0 ] 2>/dev/null; then
-  echo "Expected tfrecords (${tfrecords}) to be positive integer"
-  usage
-  exit 1
-fi
-
-case "$1" in
-  nvtabular)
-    time python -m data.outbrain.nvtabular.preproc --workers "${tfrecords}"
-    ;;
-
-  spark)
-    echo "Starting preprocessing 1/3..."
-    time python data/outbrain/spark/preproc1.py
-    echo "Starting preprocessing 2/3..."
-    time python data/outbrain/spark/preproc2.py
-    echo "Starting preprocessing 3/3..."
-    time python data/outbrain/spark/preproc3.py --num_train_partitions "${tfrecords}" --num_valid_partitions "${tfrecords}"
-    ;;
-
-  *)
-    usage
-    exit 1
-    ;;
-esac
+time python -m data.outbrain.nvtabular.preproc
--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/preproc_benchmark.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/preproc_benchmark.sh
@ -1,119 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-OUTBRAIN_DIR='/outbrain'
-SPARK_DIR='/tmp/spark'
-
-usage() {
-  cat <<EOF
-Usage: bash scripts/preproc_benchmark.sh -m nvtabular/spark
-m    | --mode           (Required)            Preprocessing to be executed from [nvtabular, spark].
-t    | --tfrecords      (Optional)            Number of tfrecords to be created, default 40.
-i    | --iteration      (Optional)            Number of benchmark iterations, default 10.
-EOF
-}
-
-if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
-  echo "You are probably calling this script from wrong directory"
-  usage
-  exit 1
-fi
-
-mode=
-iter=10
-tfrecords=40
-
-while [ "$1" != "" ]; do
-  case $1 in
-    -m | --mode)
-      shift
-      mode="$1"
-      ;;
-    -t | --tfrecords)
-      shift
-      tfrecords="$1"
-      ;;
-    -i | --iteration)
-      shift
-      iter="$1"
-      ;;
-    *)
-      usage
-      exit 1
-      ;;
-  esac
-  shift
-
-done
-
-if [ -z "$mode" ]; then
-  echo "Missing preprocessing mode"
-  usage
-  exit 1
-fi
-
-if [[ ! "$mode" =~ ^(spark|nvtabular)$ ]]; then
-  echo "Expected mode (${mode}) to be equal spark or nvtabular"
-  usage
-  exit 1
-fi
-
-if ! [ "$tfrecords" -ge 0 ] 2>/dev/null; then
-  echo "Expected tfrecords (${tfrecords}) to be positive integer"
-  usage
-  exit 1
-fi
-
-if ! [ "$iter" -ge 0 ] 2>/dev/null; then
-  echo "Expected iteration (${iter}) to be positive integer"
-  usage
-  exit 1
-fi
-
-function clean() {
-  case "$1" in
-    nvtabular)
-      rm -rf "$OUTBRAIN_DIR/data"
-      rm -rf "$OUTBRAIN_DIR/tfrecords"
-      ;;
-
-    spark)
-      rm -rf "$SPARK_DIR"
-      rm -rf "$OUTBRAIN_DIR/tfrecords"
-      ;;
-  esac
-}
-
-SECONDS=0
-
-for i in $(seq 1 "$iter"); do
-	echo "[BENCHMARK] Cleaning directories"
-	clean "${mode}"
-	echo "[BENCHMARK] Running iteration ${i}"	
-	bash scripts/memscript.sh & bash scripts/preproc.sh "${mode}" "${tfrecords}" 
-  echo "[BENCHMARK] Memory consumption during iteration ${i} (kB): $(cat mem_consumption.txt)"
-done
-echo -e "\n[BENCHMARK] Benchmark finished:\n"
-echo "[BENCHMARK] Memory consumption (kB): $(cat mem_consumption.txt)"
-rm mem_consumption.txt
-echo "[BENCHMARK] Mode=${mode}"
-echo "[BENCHMARK] Iteration=${iter}"
-echo "[BENCHMARK] Tfrecords=${tfrecords}"
-AVG_SECONDS=$((("$SECONDS" + "$iter" / 2) / "$iter"))
-printf '[BENCHMARK] Total time elapsed: %dh:%dm:%ds\n' $(("$SECONDS" / 3600)) $(("$SECONDS" % 3600 / 60)) $(("$SECONDS" % 60))
-printf '[BENCHMARK] Average iteration time: %dh:%dm:%ds\n\n' $(("$AVG_SECONDS" / 3600)) $(("$AVG_SECONDS" % 3600 / 60)) $(("$AVG_SECONDS" % 60))
--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/training_benchmark.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/training_benchmark.sh
@ -68,7 +68,7 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
  exit 1
 fi

-cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
+cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
 	python main.py \
 	--benchmark \
 	--benchmark_warmup_steps 500 \
--- a/TensorFlow2/Recommendation/WideAndDeep/scripts/training_full.sh
+++ b/TensorFlow2/Recommendation/WideAndDeep/scripts/training_full.sh
@ -68,7 +68,7 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
  exit 1
 fi

-cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
+cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
 	python main.py \
 	${amp} \
 	${xla}"
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/model/layers.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/model/layers.py
@ -1,167 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.feature_column import feature_column_v2 as fc
-
-
-def _sort_columns(feature_columns):
-    return sorted(feature_columns, key=lambda col: col.name)
-
-
-def _validate_numeric_column(feature_column):
-    if len(feature_column.shape) > 1:
-        return "Matrix numeric utils are not allowed, " "found feature {} with shape {}".format(
-            feature_column.key, feature_column.shape
-        )
-    elif feature_column.shape[0] != 1:
-        return "Vector numeric utils are not allowed, " "found feature {} with shape {}".format(
-            feature_column.key, feature_column.shape
-        )
-
-
-def _validate_categorical_column(feature_column):
-    if not isinstance(feature_column, fc.IdentityCategoricalColumn):
-        return (
-            "Only acceptable categorical columns for feeding "
-            "embeddings are identity, found column {} of type {}. "
-            "Consider using NVTabular online preprocessing to perform "
-            "categorical transformations".format(feature_column.name, type(feature_column).__name__)
-        )
-
-
-def _validate_dense_feature_columns(feature_columns):
-    _errors = []
-    for feature_column in feature_columns:
-        if isinstance(feature_column, fc.CategoricalColumn):
-            if not isinstance(feature_column, fc.BucketizedColumn):
-                _errors.append(
-                    "All feature columns must be dense, found categorical "
-                    "column {} of type {}. Please wrap categorical columns "
-                    "in embedding or indicator columns before passing".format(
-                        feature_column.name, type(feature_column).__name__
-                    )
-                )
-            else:
-                _errors.append(
-                    "Found bucketized column {}. ScalarDenseFeatures layer "
-                    "cannot apply bucketization preprocessing. Consider using "
-                    "NVTabular to do preprocessing offline".format(feature_column.name)
-                )
-        elif isinstance(feature_column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
-            _errors.append(_validate_categorical_column(feature_column.categorical_column))
-
-        elif isinstance(feature_column, fc.NumericColumn):
-            _errors.append(_validate_numeric_column(feature_column))
-
-    _errors = list(filter(lambda e: e is not None, _errors))
-    if len(_errors) > 0:
-        msg = "Found issues with columns passed to ScalarDenseFeatures:"
-        msg += "\n\t".join(_errors)
-        raise ValueError(_errors)
-
-
-def _validate_stack_dimensions(feature_columns):
-    dims = []
-    for feature_column in feature_columns:
-        if isinstance(feature_column, fc.EmbeddingColumn):
-            dimension = feature_column.dimension
-        elif isinstance(feature_column, fc.IndicatorColumn):
-            dimension = feature_column.categorical_column.num_buckets
-        else:
-            dimension = feature_column.shape[0]
-
-        dims.append(dimension)
-
-    dim0 = dims[0]
-    if not all(dim == dim0 for dim in dims[1:]):
-        dims = ", ".join(map(str, dims))
-        raise ValueError(
-            "'stack' aggregation requires all categorical "
-            "embeddings and continuous utils to have same "
-            "size. Found dimensions {}".format(dims)
-        )
-
-
-class ScalarDenseFeatures(tf.keras.layers.Layer):
-    def __init__(self, feature_columns, aggregation="concat", name=None, **kwargs):
-        feature_columns = _sort_columns(feature_columns)
-        _validate_dense_feature_columns(feature_columns)
-
-        assert aggregation in ("concat", "stack")
-        if aggregation == "stack":
-            _validate_stack_dimensions(feature_columns)
-
-        self.feature_columns = feature_columns
-        self.aggregation = aggregation
-        super(ScalarDenseFeatures, self).__init__(name=name, **kwargs)
-
-    def build(self, input_shapes):
-        assert all(shape[1] == 1 for shape in input_shapes.values())
-
-        self.embedding_tables = {}
-        for feature_column in self.feature_columns:
-            if isinstance(feature_column, fc.NumericColumn):
-                continue
-
-            feature_name = feature_column.categorical_column.key
-            num_buckets = feature_column.categorical_column.num_buckets
-            if isinstance(feature_column, fc.EmbeddingColumn):
-                self.embedding_tables[feature_name] = self.add_weight(
-                    name="{}/embedding_weights".format(feature_name),
-                    trainable=True,
-                    initializer="glorot_normal",
-                    shape=(num_buckets, feature_column.dimension),
-                )
-            else:
-                self.embedding_tables[feature_name] = self.add_weight(
-                    name="{}/embedding_weights".format(feature_name),
-                    trainable=False,
-                    initializer=tf.constant_initializer(np.eye(num_buckets)),
-                    shape=(num_buckets, num_buckets),
-                )
-        self.built = True
-
-    def call(self, inputs):
-        features = []
-        for feature_column in self.feature_columns:
-            if isinstance(feature_column, fc.NumericColumn):
-                features.append(inputs[feature_column.name])
-            else:
-                feature_name = feature_column.categorical_column.name
-                table = self.embedding_tables[feature_name]
-                embeddings = tf.gather(table, inputs[feature_name][:, 0])
-                features.append(embeddings)
-
-        if self.aggregation == "stack":
-            return tf.stack(features, axis=1)
-        return tf.concat(features, axis=1)
-
-    def compute_output_shape(self, input_shapes):
-        input_shape = [i for i in input_shapes.values()][0]
-        if self.aggregation == "concat":
-            output_dim = len(self.numeric_features) + sum(
-                [shape[-1] for shape in self.embedding_shapes.values()]
-            )
-            return (input_shape[0], output_dim)
-        else:
-            embedding_dim = [i for i in self.embedding_shapes.values()][0]
-            return (input_shape[0], len(self.embedding_shapes), embedding_dim)
-
-    def get_config(self):
-        return {
-            "feature_columns": self.feature_columns,
-            "aggregation": self.aggregation,
-        }
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/model/widedeep.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/model/widedeep.py
@ -13,65 +13,79 @@
 # limitations under the License.

 import tensorflow as tf
-
-from data.outbrain.features import get_feature_columns, NUMERIC_COLUMNS, EMBEDDING_TABLE_SHAPES
-from trainer.model.layers import ScalarDenseFeatures
+from data.outbrain.features import (
+    CATEGORICAL_COLUMNS,
+    NUMERIC_COLUMNS,
+    get_feature_columns,
+)
+from nvtabular.framework_utils.tensorflow import layers as nvtlayers


-def wide_deep_model(args):
+def get_inputs_columns():
+    """Create the Keras inputs and the per-key wide/deep feature columns.
+
+    Returns:
+        A ``(deep_columns, wide_columns, features)`` tuple. ``features`` maps
+        each column key to a ``tf.keras.Input`` of shape ``(1,)`` (``float32``
+        for keys in ``NUMERIC_COLUMNS``, ``int32`` otherwise). Each column
+        list holds one column per key (the last one seen for that key).
+    """
     wide_columns, deep_columns = get_feature_columns()
 
-    wide_weighted_outputs = []
-    numeric_dense_inputs = []
     wide_columns_dict = {}
     deep_columns_dict = {}
     features = {}
 
     for col in wide_columns:
-        features[col.key] = tf.keras.Input(shape=(1,),
-                                           batch_size=None,
-                                           name=col.key,
-                                           dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
-                                           sparse=False)
+        features[col.key] = tf.keras.Input(
+            shape=(1,),
+            batch_size=None,
+            name=col.key,
+            dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
+            sparse=False,
+        )
         wide_columns_dict[col.key] = col
+
     for col in deep_columns:
-        is_embedding_column = ('key' not in dir(col))
+        # Columns without their own ``key`` wrap a categorical column; take
+        # the key from the wrapped column in that case.
+        is_embedding_column = "key" not in dir(col)
         key = col.categorical_column.key if is_embedding_column else col.key
 
         if key not in features:
-            features[key] = tf.keras.Input(shape=(1,),
-                                           batch_size=None,
-                                           name=key,
-                                           dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
-                                           sparse=False)
+            features[key] = tf.keras.Input(
+                shape=(1,),
+                batch_size=None,
+                name=key,
+                # Use the resolved ``key``: ``col.key`` does not exist on
+                # embedding columns and would raise AttributeError here.
+                dtype=tf.float32 if key in NUMERIC_COLUMNS else tf.int32,
+                sparse=False,
+            )
         deep_columns_dict[key] = col
 
-    for key in wide_columns_dict:
-        if key in EMBEDDING_TABLE_SHAPES:
-            wide_weighted_outputs.append(tf.keras.layers.Flatten()(tf.keras.layers.Embedding(
-                EMBEDDING_TABLE_SHAPES[key][0], 1, input_length=1)(features[key])))
-        else:
-            numeric_dense_inputs.append(features[key])
-
-    categorical_output_contrib = tf.keras.layers.add(wide_weighted_outputs,
-                                                     name='categorical_output')
-    numeric_dense_tensor = tf.keras.layers.concatenate(
-        numeric_dense_inputs, name='numeric_dense')
     deep_columns = list(deep_columns_dict.values())
+    wide_columns = list(wide_columns_dict.values())
 
-    dnn = ScalarDenseFeatures(deep_columns, name='deep_embedded')(features)
+    return deep_columns, wide_columns, features
+
+
+def wide_deep_model(args):
+    """Build the Wide & Deep Keras model.
+
+    The wide branch applies ``nvtlayers.LinearFeatures`` to the wide columns.
+    The deep branch applies ``nvtlayers.DenseFeatures`` to the deep columns,
+    followed by one Dense(relu) + Dropout pair per entry of
+    ``args.deep_hidden_units`` and a final single-unit Dense layer.
+
+    Args:
+        args: Parsed arguments; ``deep_hidden_units`` and ``deep_dropout``
+            are read here.
+
+    Returns:
+        A ``(model, features)`` tuple: the
+        ``tf.keras.experimental.WideDeepModel`` (sigmoid activation) and the
+        dict of ``tf.keras.Input`` objects that both branches consume.
+    """
+    deep_columns, wide_columns, features = get_inputs_columns()
+
+    wide = nvtlayers.LinearFeatures(wide_columns, name="wide_linear")(features)
+
+    dnn = nvtlayers.DenseFeatures(deep_columns, name="deep_embedded")(features)
     for unit_size in args.deep_hidden_units:
-        dnn = tf.keras.layers.Dense(units=unit_size, activation='relu')(dnn)
+        dnn = tf.keras.layers.Dense(units=unit_size, activation="relu")(dnn)
         dnn = tf.keras.layers.Dropout(rate=args.deep_dropout)(dnn)
     dnn = tf.keras.layers.Dense(units=1)(dnn)
-    dnn_model = tf.keras.Model(inputs=features,
-                               outputs=dnn)
-    linear_output = categorical_output_contrib + tf.keras.layers.Dense(1)(numeric_dense_tensor)
 
-    linear_model = tf.keras.Model(inputs=features,
-                                  outputs=linear_output)
+    # Both sub-models share the same input dict.
+    dnn_model = tf.keras.Model(inputs=features, outputs=dnn)
+    linear_model = tf.keras.Model(inputs=features, outputs=wide)
 
     model = tf.keras.experimental.WideDeepModel(
-        linear_model, dnn_model, activation='sigmoid')
+        linear_model, dnn_model, activation="sigmoid"
+    )
 
-    return model
+    return model, features
+
+
+def get_dummy_inputs(batch_size):
+    """Return an all-zero input dict covering every model feature.
+
+    Args:
+        batch_size: Size of the first dimension of every generated tensor.
+
+    Returns:
+        A dict mapping each name in ``CATEGORICAL_COLUMNS`` to an ``int32``
+        zero tensor and each name in ``NUMERIC_COLUMNS`` to a ``float32``
+        zero tensor, all of shape ``(batch_size, 1)``.
+    """
+    inputs = {}
+    shape = (batch_size, 1)
+    for cat in CATEGORICAL_COLUMNS:
+        inputs[cat] = tf.zeros(shape, dtype=tf.dtypes.int32)
+
+    for cat in NUMERIC_COLUMNS:
+        inputs[cat] = tf.zeros(shape, dtype=tf.dtypes.float32)
+
+    return inputs
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/run.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/run.py
@ -12,398 +12,70 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
-import os
-import time
-
-import dllogger
 import horovod.tensorflow as hvd
-import numpy as np
 import tensorflow as tf
-from data.outbrain.features import DISPLAY_ID_COLUMN
-from tensorflow.python.keras import backend as K
-from trainer.utils.schedulers import get_schedule
+from trainer.utils.benchmark import ThroughputCalculator
+from trainer.utils.evaluator import Evaluator
+from trainer.utils.schedulers import LearningRateScheduler
+from trainer.utils.trainer import Trainer


-def train(args, model, config):
-    logger = logging.getLogger('tensorflow')
+def run(args, model, config):
+    train_dataset = config["train_dataset"]
+    eval_dataset = config["eval_dataset"]
+    steps_per_epoch = len(train_dataset)

-    train_dataset = config['train_dataset']
-    eval_dataset = config['eval_dataset']
-    steps = int(config['steps_per_epoch'])
-    schedule = get_schedule(
+    steps = int(steps_per_epoch * args.num_epochs)
+    deep_optimizer = tf.keras.optimizers.RMSprop(
+        learning_rate=args.deep_learning_rate, rho=0.5
+    )
+
+    wide_optimizer = tf.keras.optimizers.Ftrl(learning_rate=args.linear_learning_rate)
+
+    if not args.cpu:
+        deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)
+        wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
+
+    if args.amp:
+        deep_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
+            deep_optimizer, dynamic=True
+        )
+        wide_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
+            wide_optimizer, dynamic=True
+        )
+
+    scheduler = LearningRateScheduler(
+        args=args, steps_per_epoch=steps_per_epoch, optimizer=deep_optimizer
+    )
+
+    throughput_calculator = ThroughputCalculator(args)
+    compiled_loss = tf.keras.losses.BinaryCrossentropy()
+
+    evaluator = Evaluator(
+        model=model,
+        throughput_calculator=throughput_calculator,
+        eval_dataset=eval_dataset,
+        compiled_loss=compiled_loss,
+        steps=steps,
        args=args,
-        steps_per_epoch=steps
-    )
-    writer = tf.summary.create_file_writer(os.path.join(args.model_dir, 'event_files'))
-
-    deep_optimizer = tf.keras.optimizers.RMSprop(
-        learning_rate=args.deep_learning_rate,
-        rho=0.5
    )

-    wide_optimizer = tf.keras.optimizers.Ftrl(
-        learning_rate=args.linear_learning_rate
-    )
-
-    compiled_loss = tf.keras.losses.BinaryCrossentropy()
-    eval_loss = tf.keras.metrics.Mean()
-
-    metrics = [
-        tf.keras.metrics.BinaryAccuracy(),
-        tf.keras.metrics.AUC()
-    ]
-
-    current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
-    display_id_counter = tf.Variable(0., trainable=False, dtype=tf.float64)
-    streaming_map = tf.Variable(0., name='STREAMING_MAP', trainable=False, dtype=tf.float64)
-
-    checkpoint = tf.train.Checkpoint(
+    trainer = Trainer(
+        model=model,
+        scheduler=scheduler,
        deep_optimizer=deep_optimizer,
        wide_optimizer=wide_optimizer,
-        model=model,
-        current_step=current_step_var
-    )
-    manager = tf.train.CheckpointManager(
-        checkpoint=checkpoint,
-        directory=os.path.join(args.model_dir, 'checkpoint'),
-        max_to_keep=1
+        throughput_calculator=throughput_calculator,
+        compiled_loss=compiled_loss,
+        steps=steps,
+        args=args,
+        train_dataset=train_dataset,
+        evaluator=evaluator,
    )

-    if args.use_checkpoint:
-        checkpoint.restore(manager.latest_checkpoint)
-        if manager.latest_checkpoint:
-            logger.warning(f'Model restored from checkpoint {args.model_dir}')
-            if args.benchmark:
-                current_step_var.assign(0)
-        else:
-            logger.warning(f'Failed to restore model from checkpoint {args.model_dir}')
+    trainer.maybe_restore_checkpoint()

-    if args.amp:
-        deep_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-            deep_optimizer,
-            loss_scale='dynamic'
-        )
-        wide_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-            wide_optimizer,
-            loss_scale='dynamic'
-        )
-
-    @tf.function
-    def train_step(x, y, first_batch):
-        with tf.GradientTape(persistent=True) as tape:
-            y_pred = model(x, training=True)
-            loss = compiled_loss(y, y_pred)
-            linear_loss = wide_optimizer.get_scaled_loss(loss) if args.amp else loss
-            deep_loss = deep_optimizer.get_scaled_loss(loss) if args.amp else loss
-
-        if not args.cpu:
-            tape = hvd.DistributedGradientTape(tape)
-
-        for metric in metrics:
-            metric.update_state(y, y_pred)
-
-        linear_vars = model.linear_model.trainable_variables
-        dnn_vars = model.dnn_model.trainable_variables
-        linear_grads = tape.gradient(linear_loss, linear_vars)
-        dnn_grads = tape.gradient(deep_loss, dnn_vars)
-        if args.amp:
-            linear_grads = wide_optimizer.get_unscaled_gradients(linear_grads)
-            dnn_grads = deep_optimizer.get_unscaled_gradients(dnn_grads)
-
-        wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
-        deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))
-        if first_batch and not args.cpu:
-            hvd.broadcast_variables(model.linear_model.variables, root_rank=0)
-            hvd.broadcast_variables(model.dnn_model.variables, root_rank=0)
-            hvd.broadcast_variables(wide_optimizer.variables(), root_rank=0)
-            hvd.broadcast_variables(deep_optimizer.variables(), root_rank=0)
-        return loss
-
-    @tf.function
-    def evaluation_step(x, y):
-        predictions = model(x, training=False)
-        loss = compiled_loss(y, predictions)
-
-        for metric in metrics:
-            metric.update_state(y, predictions)
-
-        predictions = tf.reshape(predictions, [-1])
-        predictions = tf.cast(predictions, tf.float64)
-        display_ids = x[DISPLAY_ID_COLUMN]
-        display_ids = tf.reshape(display_ids, [-1])
-        labels = tf.reshape(y, [-1])
-        sorted_ids = tf.argsort(display_ids)
-        display_ids = tf.gather(display_ids, indices=sorted_ids)
-        predictions = tf.gather(predictions, indices=sorted_ids)
-        labels = tf.gather(labels, indices=sorted_ids)
-        _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(display_ids, out_idx=tf.int64)
-        pad_length = 30 - tf.reduce_max(display_ids_ads_count)
-        preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
-        labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
-
-        labels_mask = tf.math.reduce_max(labels, 1)
-        preds_masked = tf.boolean_mask(preds, labels_mask)
-        labels_masked = tf.boolean_mask(labels, labels_mask)
-        labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
-        labels_masked = tf.reshape(labels_masked, [-1, 1])
-
-        preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
-        _, predictions_idx = tf.math.top_k(preds_masked, 12)
-        indices = tf.math.equal(predictions_idx, labels_masked)
-        indices_mask = tf.math.reduce_any(indices, 1)
-        masked_indices = tf.boolean_mask(indices, indices_mask)
-
-        res = tf.argmax(masked_indices, axis=1)
-        ap_matrix = tf.divide(1, tf.add(res, 1))
-        ap_sum = tf.reduce_sum(ap_matrix)
-        shape = tf.cast(tf.shape(indices)[0], tf.float64)
-        display_id_counter.assign_add(shape)
-        streaming_map.assign_add(ap_sum)
-        return loss
-
-    t0 = None
-    t_batch = None
-
-    with writer.as_default():
-        for epoch in range(1, args.num_epochs + 1):
-            for step, (x, y) in enumerate(train_dataset):
-                current_step = np.asscalar(current_step_var.numpy())
-                schedule(optimizer=deep_optimizer, current_step=current_step)
-
-                for metric in metrics:
-                    metric.reset_states()
-                loss = train_step(x, y, epoch == 1 and step == 0)
-                if args.cpu or hvd.rank() == 0:
-                    for metric in metrics:
-                        tf.summary.scalar(f'{metric.name}', metric.result(), step=current_step)
-                    tf.summary.scalar('loss', loss, step=current_step)
-                    tf.summary.scalar('schedule', K.get_value(deep_optimizer.lr), step=current_step)
-                    writer.flush()
-
-                if args.benchmark:
-                    boundary = max(args.benchmark_warmup_steps, 1)
-                    if current_step == boundary:
-                        t0 = time.time()
-                    if current_step > boundary:
-                        batch_time = time.time() - t_batch
-                        samplesps = args.global_batch_size / batch_time
-                        dllogger.log(data={'batch_samplesps': samplesps}, step=(1, current_step))
-
-                        if args.benchmark_steps <= current_step:
-                            train_time = time.time() - t0
-                            epochs = args.benchmark_steps - max(args.benchmark_warmup_steps, 1)
-                            train_throughput = (args.global_batch_size * epochs) / train_time
-                            dllogger.log(
-                                data={'train_throughput': train_throughput},
-                                step=tuple()
-                            )
-                            return
-
-                else:
-                    if current_step % 100 == 0:
-                        train_data = {metric.name: f'{metric.result().numpy():.4f}' for metric in metrics}
-                        train_data['loss'] = f'{loss.numpy():.4f}'
-                        dllogger.log(data=train_data, step=(current_step, args.num_epochs * steps))
-
-                    if step == steps:
-                        break
-
-                current_step_var.assign_add(1)
-                t_batch = time.time()
-            if args.benchmark:
-                continue
-
-            for metric in metrics:
-                metric.reset_states()
-            eval_loss.reset_states()
-
-            for step, (x, y) in enumerate(eval_dataset):
-                loss = evaluation_step(x, y)
-                eval_loss.update_state(loss)
-
-            map_metric = tf.divide(streaming_map, display_id_counter) if args.cpu else \
-                hvd.allreduce(tf.divide(streaming_map, display_id_counter))
-
-            map_metric = map_metric.numpy()
-            eval_loss_reduced = eval_loss.result() if args.cpu else \
-                hvd.allreduce(eval_loss.result())
-
-            metrics_reduced = {
-                f'{metric.name}_val': metric.result() if args.cpu else
-                hvd.allreduce(metric.result()) for metric in metrics
-            }
-
-            for name, result in metrics_reduced.items():
-                tf.summary.scalar(f'{name}', result, step=steps * epoch)
-            tf.summary.scalar('loss_val', eval_loss_reduced, step=steps * epoch)
-            tf.summary.scalar('map_val', map_metric, step=steps * epoch)
-            writer.flush()
-
-            eval_data = {name: f'{result.numpy():.4f}' for name, result in metrics_reduced.items()}
-            eval_data.update({
-                'loss_val': f'{eval_loss_reduced.numpy():.4f}',
-                'streaming_map_val': f'{map_metric:.4f}'
-            })
-            dllogger.log(data=eval_data, step=(steps * epoch, args.num_epochs * steps))
-
-            if args.cpu or hvd.rank() == 0:
-                manager.save()
-
-            display_id_counter.assign(0)
-            streaming_map.assign(0)
-        if args.cpu or hvd.rank() == 0:
-            dllogger.log(data=eval_data, step=tuple())
-
-
-def evaluate(args, model, config):
-    logger = logging.getLogger('tensorflow')
-
-    deep_optimizer = tf.keras.optimizers.RMSprop(
-        learning_rate=args.deep_learning_rate,
-        rho=0.5
-    )
-
-    wide_optimizer = tf.keras.optimizers.Ftrl(
-        learning_rate=args.linear_learning_rate
-    )
-
-    compiled_loss = tf.keras.losses.BinaryCrossentropy()
-    eval_loss = tf.keras.metrics.Mean()
-
-    metrics = [
-        tf.keras.metrics.BinaryAccuracy(),
-        tf.keras.metrics.AUC()
-    ]
-
-    if args.amp:
-        deep_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-            deep_optimizer,
-            loss_scale='dynamic'
-        )
-        wide_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-            wide_optimizer,
-            loss_scale='dynamic'
-        )
-
-    current_step = 0
-    current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
-    display_id_counter = tf.Variable(0., trainable=False, dtype=tf.float64)
-    streaming_map = tf.Variable(0., name='STREAMING_MAP', trainable=False, dtype=tf.float64)
-
-    checkpoint = tf.train.Checkpoint(
-        deep_optimizer=deep_optimizer,
-        wide_optimizer=wide_optimizer,
-        model=model,
-        current_step=current_step_var
-    )
-    manager = tf.train.CheckpointManager(
-        checkpoint=checkpoint,
-        directory=os.path.join(args.model_dir, 'checkpoint'),
-        max_to_keep=1
-    )
-
-    if args.use_checkpoint:
-        checkpoint.restore(manager.latest_checkpoint).expect_partial()
-        if manager.latest_checkpoint:
-            logger.warning(f'Model restored from checkpoint {args.model_dir}')
-        else:
-            logger.warning(f'Failed to restore model from checkpoint {args.model_dir}')
-
-    @tf.function
-    def evaluation_step(x, y):
-        predictions = model(x, training=False)
-        loss = compiled_loss(y, predictions)
-
-        for metric in metrics:
-            metric.update_state(y, predictions)
-
-        predictions = tf.reshape(predictions, [-1])
-        predictions = tf.cast(predictions, tf.float64)
-        display_ids = x[DISPLAY_ID_COLUMN]
-        display_ids = tf.reshape(display_ids, [-1])
-        labels = tf.reshape(y, [-1])
-        sorted_ids = tf.argsort(display_ids)
-        display_ids = tf.gather(display_ids, indices=sorted_ids)
-        predictions = tf.gather(predictions, indices=sorted_ids)
-        labels = tf.gather(labels, indices=sorted_ids)
-        _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(display_ids, out_idx=tf.int64)
-        pad_length = 30 - tf.reduce_max(display_ids_ads_count)
-        preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
-        labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
-
-        labels_mask = tf.math.reduce_max(labels, 1)
-        preds_masked = tf.boolean_mask(preds, labels_mask)
-        labels_masked = tf.boolean_mask(labels, labels_mask)
-        labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
-        labels_masked = tf.reshape(labels_masked, [-1, 1])
-
-        preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
-        _, predictions_idx = tf.math.top_k(preds_masked, 12)
-        indices = tf.math.equal(predictions_idx, labels_masked)
-        indices_mask = tf.math.reduce_any(indices, 1)
-        masked_indices = tf.boolean_mask(indices, indices_mask)
-
-        res = tf.argmax(masked_indices, axis=1)
-        ap_matrix = tf.divide(1, tf.add(res, 1))
-        ap_sum = tf.reduce_sum(ap_matrix)
-        shape = tf.cast(tf.shape(indices)[0], tf.float64)
-        display_id_counter.assign_add(shape)
-        streaming_map.assign_add(ap_sum)
-        return loss
-
-    eval_dataset = config['eval_dataset']
-
-    t0 = None
-    t_batch = None
-
-    for step, (x, y) in enumerate(eval_dataset):
-        loss = evaluation_step(x, y)
-        eval_loss.update_state(loss)
-        if args.benchmark:
-            boundary = max(args.benchmark_warmup_steps, 1)
-            if current_step == boundary:
-                t0 = time.time()
-            if current_step > boundary:
-                batch_time = time.time() - t_batch
-                samplesps = args.eval_batch_size / batch_time
-                if args.cpu or hvd.rank() == 0:
-                    dllogger.log(data={'batch_samplesps': samplesps}, step=(1, current_step))
-
-                if args.benchmark_steps <= current_step:
-                    valid_time = time.time() - t0
-                    epochs = args.benchmark_steps - max(args.benchmark_warmup_steps, 1)
-                    valid_throughput = (args.eval_batch_size * epochs) / valid_time
-                    if args.cpu or hvd.rank() == 0:
-                        dllogger.log(
-                            data={'validation_throughput': valid_throughput},
-                            step=tuple()
-                        )
-                    return
-
-        else:
-            if step % 100 == 0:
-                valid_data = {metric.name: f'{metric.result().numpy():.4f}' for metric in metrics}
-                valid_data['loss'] = f'{loss.numpy():.4f}'
-                if args.cpu or hvd.rank() == 0:
-                    dllogger.log(data=valid_data, step=(step,))
-        current_step += 1
-        t_batch = time.time()
-
-    map_metric = tf.divide(streaming_map, display_id_counter) if args.cpu else \
-        hvd.allreduce(tf.divide(streaming_map, display_id_counter))
-    eval_loss_reduced = eval_loss.result() if args.cpu else \
-        hvd.allreduce(eval_loss.result())
-
-    metrics_reduced = {
-        f'{metric.name}_val': metric.result() if args.cpu else
-        hvd.allreduce(metric.result()) for metric in metrics
-    }
-
-    eval_data = {name: f'{result.numpy():.4f}' for name, result in metrics_reduced.items()}
-    eval_data.update({
-        'loss_val': f'{eval_loss_reduced.numpy():.4f}',
-        'streaming_map_val': f'{map_metric.numpy():.4f}'
-    })
-
-    dllogger.log(data=eval_data, step=(step,))
+    if args.evaluate:
+        evaluator.eval(trainer.current_step_var)
+    else:
+        trainer.run_loop()
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/arguments.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/arguments.py
@ -14,101 +14,178 @@

 import argparse

-# Default train dataset size
-TRAIN_DATASET_SIZE = 59761827
+DEFAULT_DIR = "/outbrain"


 def parse_args():
    parser = argparse.ArgumentParser(
-        description='Tensorflow2 WideAndDeep Model',
+        description="Tensorflow2 WideAndDeep Model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        add_help=True,
    )

-    locations = parser.add_argument_group('location of datasets')
+    locations = parser.add_argument_group("location of datasets")

-    locations.add_argument('--train_data_pattern', type=str, default='/outbrain/tfrecords/train/part*', nargs='+',
-                           help='Pattern of training file names. For example if training files are train_000.tfrecord, '
-                                'train_001.tfrecord then --train_data_pattern is train_*')
+    locations.add_argument(
+        "--train_data_pattern",
+        type=str,
+        default=f"{DEFAULT_DIR}/data/train/*.parquet",
+        help="Pattern of training file names. For example if training files are part_0.parquet, "
+             "part_1.parquet then --train_data_pattern is *.parquet",
+    )

-    locations.add_argument('--eval_data_pattern', type=str, default='/outbrain/tfrecords/eval/part*', nargs='+',
-                           help='Pattern of eval file names. For example if eval files are eval_000.tfrecord, '
-                                'eval_001.tfrecord then --eval_data_pattern is eval_*')
+    locations.add_argument(
+        "--eval_data_pattern",
+        type=str,
+        default=f"{DEFAULT_DIR}/data/valid/*.parquet",
+        help="Pattern of eval file names. For example if eval files are part_0.parquet, "
+             "part_1.parquet then --eval_data_pattern is *.parquet",
+    )

-    locations.add_argument('--transformed_metadata_path', type=str, default='/outbrain/tfrecords',
-                           help='Path to transformed_metadata for feature specification reconstruction')
+    locations.add_argument(
+        "--use_checkpoint",
+        default=False,
+        action="store_true",
+        help="Use checkpoint stored in model_dir path",
+    )

-    locations.add_argument('--use_checkpoint', default=False, action='store_true',
-                           help='Use checkpoint stored in model_dir path')
+    locations.add_argument(
+        "--model_dir",
+        type=str,
+        default=f"{DEFAULT_DIR}/checkpoints",
+        help="Destination where model checkpoint will be saved",
+    )

-    locations.add_argument('--model_dir', type=str, default='/outbrain/checkpoints',
-                           help='Destination where model checkpoint will be saved')
+    locations.add_argument(
+        "--results_dir",
+        type=str,
+        default="/results",
+        help="Directory to store training results",
+    )

-    locations.add_argument('--results_dir', type=str, default='/results',
-                           help='Directory to store training results')
+    locations.add_argument(
+        "--log_filename",
+        type=str,
+        default="log.json",
+        help="Name of the file to store dllogger output",
+    )

-    locations.add_argument('--log_filename', type=str, default='log.json',
-                           help='Name of the file to store dlloger output')
+    training_params = parser.add_argument_group("training parameters")

-    training_params = parser.add_argument_group('training parameters')
+    training_params.add_argument(
+        "--global_batch_size",
+        type=int,
+        default=131072,
+        help="Total size of training batch",
+    )

-    training_params.add_argument('--training_set_size', type=int, default=TRAIN_DATASET_SIZE,
-                                 help='Number of samples in the training set')
+    training_params.add_argument(
+        "--eval_batch_size",
+        type=int,
+        default=131072,
+        help="Total size of evaluation batch",
+    )

-    training_params.add_argument('--global_batch_size', type=int, default=131072,
-                                 help='Total size of training batch')
+    training_params.add_argument(
+        "--num_epochs", type=int, default=20, help="Number of training epochs"
+    )

-    training_params.add_argument('--eval_batch_size', type=int, default=131072,
-                                 help='Total size of evaluation batch')
+    training_params.add_argument(
+        "--cpu", default=False, action="store_true", help="Run computations on the CPU"
+    )

-    training_params.add_argument('--num_epochs', type=int, default=20,
-                                 help='Number of training epochs')
+    training_params.add_argument(
+        "--amp",
+        default=False,
+        action="store_true",
+        help="Enable automatic mixed precision conversion",
+    )

-    training_params.add_argument('--cpu', default=False, action='store_true',
-                                 help='Run computations on the CPU')
+    training_params.add_argument(
+        "--xla", default=False, action="store_true", help="Enable XLA conversion"
+    )

-    training_params.add_argument('--amp', default=False, action='store_true',
-                                 help='Enable automatic mixed precision conversion')
+    training_params.add_argument(
+        "--linear_learning_rate",
+        type=float,
+        default=0.02,
+        help="Learning rate for linear model",
+    )

-    training_params.add_argument('--xla', default=False, action='store_true',
-                                 help='Enable XLA conversion')
+    training_params.add_argument(
+        "--deep_learning_rate",
+        type=float,
+        default=0.00012,
+        help="Learning rate for deep model",
+    )

-    training_params.add_argument('--linear_learning_rate', type=float, default=0.02,
-                                 help='Learning rate for linear model')
+    training_params.add_argument(
+        "--deep_warmup_epochs",
+        type=float,
+        default=6,
+        help="Number of learning rate warmup epochs for deep model",
+    )

-    training_params.add_argument('--deep_learning_rate', type=float, default=0.00012,
-                                 help='Learning rate for deep model')
+    model_construction = parser.add_argument_group("model construction")

-    training_params.add_argument('--deep_warmup_epochs', type=float, default=6,
-                                 help='Number of learning rate warmup epochs for deep model')
+    model_construction.add_argument(
+        "--deep_hidden_units",
+        type=int,
+        default=[1024, 1024, 1024, 1024, 1024],
+        nargs="+",
+        help="Hidden units per layer for deep model, separated by spaces",
+    )

-    model_construction = parser.add_argument_group('model construction')
+    model_construction.add_argument(
+        "--deep_dropout",
+        type=float,
+        default=0.1,
+        help="Dropout regularization for deep model",
+    )

-    model_construction.add_argument('--deep_hidden_units', type=int, default=[1024, 1024, 1024, 1024, 1024], nargs="+",
-                                    help='Hidden units per layer for deep model, separated by spaces')
+    run_params = parser.add_argument_group("run mode parameters")

-    model_construction.add_argument('--deep_dropout', type=float, default=0.1,
-                                    help='Dropout regularization for deep model')
+    run_params.add_argument(
+        "--evaluate",
+        default=False,
+        action="store_true",
+        help="Only perform an evaluation on the validation dataset, don't train",
+    )

-    run_params = parser.add_argument_group('run mode parameters')
+    run_params.add_argument(
+        "--benchmark",
+        action="store_true",
+        default=False,
+        help="Run training or evaluation benchmark to collect performance metrics",
+    )

-    run_params.add_argument('--evaluate', default=False, action='store_true',
-                            help='Only perform an evaluation on the validation dataset, don\'t train')
+    run_params.add_argument(
+        "--benchmark_warmup_steps",
+        type=int,
+        default=500,
+        help="Number of warmup steps before start of the benchmark",
+    )

-    run_params.add_argument('--benchmark', action='store_true', default=False,
-                            help='Run training or evaluation benchmark to collect performance metrics', )
+    run_params.add_argument(
+        "--benchmark_steps",
+        type=int,
+        default=1000,
+        help="Number of steps for performance benchmark",
+    )

-    run_params.add_argument('--benchmark_warmup_steps', type=int, default=500,
-                            help='Number of warmup steps before start of the benchmark')
-
-    run_params.add_argument('--benchmark_steps', type=int, default=1000,
-                            help='Number of steps for performance benchmark')
-
-    run_params.add_argument('--affinity', type=str, default='socket_unique_interleaved',
-                            choices=['socket', 'single', 'single_unique',
-                                     'socket_unique_interleaved',
-                                     'socket_unique_continuous',
-                                     'disabled'],
-                            help='Type of CPU affinity')
+    run_params.add_argument(
+        "--affinity",
+        type=str,
+        default="socket_unique_interleaved",
+        choices=[
+            "socket",
+            "single",
+            "single_unique",
+            "socket_unique_interleaved",
+            "socket_unique_continuous",
+            "disabled",
+        ],
+        help="Type of CPU affinity",
+    )

    return parser.parse_args()
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/benchmark.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/benchmark.py
@ -0,0 +1,70 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import dllogger
+import horovod.tensorflow as hvd
+import tensorflow as tf
+from horovod.tensorflow.mpi_ops import Sum
+
+
+class ThroughputCalculator:
+    """Measure and log per-step and overall throughput during a benchmark run.
+
+    The instance is called once per processed batch. It reads
+    ``args.benchmark``, ``args.benchmark_warmup_steps``,
+    ``args.benchmark_steps`` and ``args.cpu``. Nothing is measured unless
+    ``args.benchmark`` is set.
+    """
+
+    def __init__(self, args):
+        """Store the run arguments and initialise the counters.
+
+        Args:
+            args: parsed command-line arguments (see the attributes listed
+                in the class docstring).
+        """
+        self.args = args
+        # Steps up to and including ``boundary`` are warmup; at least one
+        # warmup step is always enforced.
+        self.boundary = max(self.args.benchmark_warmup_steps, 1)
+        self.step = 0
+        self.t0 = None
+        self.start_batch_time = None
+        with tf.device("/CPU:0"):
+            # Number of samples this worker processed after the warmup.
+            self.samples = tf.Variable(0, trainable=False, dtype=tf.int64)
+
+    def _init_benchmark(self):
+        """Record the wall-clock time at which the measured phase starts."""
+        self.t0 = time.time()
+
+    def on_epoch_end_log(self, step, shape):
+        """Log the throughput of the batch that has just finished.
+
+        Despite the name, ``__call__`` invokes this once per step.
+
+        Args:
+            step: index of the current benchmark step, used in the log entry.
+            shape: number of samples in the local batch.
+        """
+        batch_time = time.time() - self.start_batch_time
+        self.samples.assign_add(shape)
+        workers = hvd.size() if not self.args.cpu else 1
+        # The local batch size is scaled by the worker count to estimate the
+        # global samples-per-second figure.
+        samplesps = shape * workers / batch_time
+        if self.args.cpu or hvd.rank() == 0:
+            dllogger.log(data={"batch_samplesps": samplesps}, step=(1, step))
+
+    def on_benchmark_end_log(self, eval_benchmark=False):
+        """Log the overall throughput of the measured phase.
+
+        Args:
+            eval_benchmark: when true the value is logged under
+                ``validation_throughput``, otherwise ``train_throughput``.
+        """
+        train_time = time.time() - self.t0
+        if not self.args.cpu:
+            # Horovod is only touched on the non-CPU path, like every other
+            # hvd call in this class; join() precedes the reduction as before.
+            hvd.join()
+            all_samples = hvd.allreduce(self.samples, op=Sum)
+        else:
+            all_samples = self.samples
+
+        all_samples = all_samples.numpy()
+
+        if self.args.cpu or hvd.rank() == 0:
+            key = "train_throughput" if not eval_benchmark else "validation_throughput"
+            throughput = all_samples / train_time
+            dllogger.log(data={key: throughput}, step=tuple())
+
+    def __call__(self, shape, eval_benchmark=False):
+        """Advance the benchmark by one step.
+
+        The timer starts when the step counter equals the warmup boundary.
+        Every later step is logged, and once the counter reaches
+        ``args.benchmark_steps`` the summary is logged and the process
+        terminates with exit status 0.
+
+        Args:
+            shape: number of samples in the batch that was just processed.
+            eval_benchmark: forwarded to ``on_benchmark_end_log``.
+
+        Raises:
+            SystemExit: when the benchmark has completed.
+        """
+        if self.args.benchmark:
+            if self.step == self.boundary:
+                self._init_benchmark()
+            if self.step > self.boundary:
+                self.on_epoch_end_log(self.step, shape)
+                if self.args.benchmark_steps <= self.step:
+                    self.on_benchmark_end_log(eval_benchmark=eval_benchmark)
+                    # The builtin exit() is injected by the site module and
+                    # is meant for interactive use; raise SystemExit directly.
+                    raise SystemExit(0)
+
+            self.step += 1
+            self.start_batch_time = time.time()
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/evaluator.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/evaluator.py
@ -0,0 +1,164 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dllogger
+import horovod.tensorflow as hvd
+import tensorflow as tf
+from data.outbrain.features import DISPLAY_ID_COLUMN
+from horovod.tensorflow.mpi_ops import Sum, Average
+
+
+class Evaluator:
+    """Run the model over the evaluation dataset and report loss and MAP.
+
+    Per-worker results are accumulated in CPU-placed variables and, unless
+    ``args.cpu`` is set, combined across Horovod workers before logging.
+    """
+
+    def __init__(
+        self,
+        model,
+        throughput_calculator,
+        eval_dataset,
+        compiled_loss,
+        steps,
+        args,
+    ):
+        """Store collaborators and create the accumulator variables.
+
+        Args:
+            model: callable invoked as ``model(x, training=False)``.
+            throughput_calculator: callable invoked per batch in benchmark mode.
+            eval_dataset: iterable yielding ``(x, y)`` batches.
+            compiled_loss: callable invoked as ``compiled_loss(y, predictions)``.
+            steps: second element of the ``step`` tuple passed to dllogger.
+            args: parsed arguments; ``cpu`` and ``benchmark`` are read here.
+        """
+
+        self.model = model
+        self.steps = steps
+        self.args = args
+        self.throughput_calculator = throughput_calculator
+        self.compiled_loss = compiled_loss
+        self.eval_loss = tf.keras.metrics.Mean()
+        # No additional metrics are registered in this class; the loops over
+        # ``self.metrics`` below do nothing while the list stays empty.
+        self.metrics = []
+        self.eval_dataset = eval_dataset
+
+        with tf.device("/CPU:0"):
+            self.current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
+            # Number of display-id groups counted by _calculate_map.
+            self.display_id_counter = tf.Variable(
+                0.0, trainable=False, dtype=tf.float64
+            )
+            # Running sum of the per-group precision contributions.
+            self.streaming_map = tf.Variable(
+                0.0, name="STREAMING_MAP", trainable=False, dtype=tf.float64
+            )
+
+    def _reset_states(self):
+        """Reset the loss metric and the MAP accumulators before an evaluation."""
+        for metric in self.metrics:
+            metric.reset_states()
+
+        self.eval_loss.reset_states()
+        # NOTE(review): the accumulators are reset to 1 although __init__
+        # starts them at 0. This keeps the division in _reduce_results away
+        # from 0/0, but also adds one to numerator and denominator of the
+        # reported MAP -- confirm that this is intended.
+        self.display_id_counter.assign(1)
+        self.current_step_var.assign(1)
+        self.streaming_map.assign(1)
+
+    @tf.function
+    def _calculate_map(self, x, y, predictions):
+        """Add one batch's contribution to the MAP accumulators.
+
+        Args:
+            x: batch features; ``x[DISPLAY_ID_COLUMN]`` holds the display ids.
+            y: batch labels.
+            predictions: model outputs for the batch.
+        """
+        predictions = tf.reshape(predictions, [-1])
+        predictions = tf.cast(predictions, tf.float64)
+        display_ids = x[DISPLAY_ID_COLUMN]
+        display_ids = tf.reshape(display_ids, [-1])
+        labels = tf.reshape(y, [-1])
+        # Sort by display id so that rows of the same display are contiguous.
+        sorted_ids = tf.argsort(display_ids)
+        display_ids = tf.gather(display_ids, indices=sorted_ids)
+        predictions = tf.gather(predictions, indices=sorted_ids)
+        labels = tf.gather(labels, indices=sorted_ids)
+        _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(
+            display_ids, out_idx=tf.int64
+        )
+        # Rows are later padded to a width of 30; this assumes no display id
+        # occurs more than 30 times in a batch -- TODO confirm.
+        pad_length = 30 - tf.reduce_max(display_ids_ads_count)
+        # One row per display id, one column per row of that display.
+        preds = tf.RaggedTensor.from_value_rowids(
+            predictions, display_ids_idx
+        ).to_tensor()
+        labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
+
+        # Keep the groups selected by the per-group maximum label
+        # (presumably the displays that contain a positive label).
+        labels_mask = tf.math.reduce_max(labels, 1)
+        preds_masked = tf.boolean_mask(preds, labels_mask)
+        labels_masked = tf.boolean_mask(labels, labels_mask)
+        # Column position of the maximum label within each kept group.
+        labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
+        labels_masked = tf.reshape(labels_masked, [-1, 1])
+
+        preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
+        # Column indices of the 12 highest predictions in each group.
+        _, predictions_idx = tf.math.top_k(preds_masked, 12)
+        indices = tf.math.equal(predictions_idx, labels_masked)
+        # Only groups whose label position appears among the top 12 contribute.
+        indices_mask = tf.math.reduce_any(indices, 1)
+        masked_indices = tf.boolean_mask(indices, indices_mask)
+
+        # Zero-based rank of the label position; each hit contributes
+        # 1 / (rank + 1).
+        res = tf.argmax(masked_indices, axis=1)
+        ap_matrix = tf.divide(1, tf.add(res, 1))
+        ap_sum = tf.reduce_sum(ap_matrix)
+        # The denominator counts every group kept by labels_mask, including
+        # those without a hit in the top 12.
+        shape = tf.cast(tf.shape(indices)[0], tf.float64)
+        self.display_id_counter.assign_add(shape)
+        self.streaming_map.assign_add(ap_sum)
+
+    @tf.function
+    def _execute_step_calculations(self, x, y):
+        """Run the model on one batch and update loss and MAP state.
+
+        Returns:
+            The loss computed by ``compiled_loss`` for this batch.
+        """
+        predictions = self.model(x, training=False)
+
+        with tf.device("/CPU:0"):
+            loss = self.compiled_loss(y, predictions)
+            for metric in self.metrics:
+                metric.update_state(y, predictions)
+            self.eval_loss.update_state(loss)
+            self._calculate_map(x, y, predictions)
+
+        return loss
+
+    @tf.function
+    def _reduce_results(self):
+        """Combine the accumulated results into the final metrics.
+
+        Unless ``args.cpu`` is set, the MAP accumulators are summed and the
+        mean loss is averaged across Horovod workers.
+
+        Returns:
+            Tuple ``(map_metric, eval_loss)``.
+        """
+        if not self.args.cpu:
+            all_streaming_map = hvd.allreduce(self.streaming_map, op=Sum)
+            all_display_id_counter = hvd.allreduce(self.display_id_counter, op=Sum)
+            eval_loss = hvd.allreduce(
+                self.eval_loss.result(), op=Average
+            )
+        else:
+            all_streaming_map = self.streaming_map
+            all_display_id_counter = self.display_id_counter
+            eval_loss = self.eval_loss.result()
+
+        map_metric = tf.divide(all_streaming_map, all_display_id_counter)
+
+        return map_metric, eval_loss
+
+    @staticmethod
+    def log(eval_data, step, steps):
+        """Write ``eval_data`` to dllogger under the step tuple ``(step, steps)``."""
+        dllogger.log(data=eval_data, step=(step, steps))
+
+    def eval_step(self, x, y):
+        """Evaluate one batch and, in benchmark mode, report its size."""
+        self._execute_step_calculations(x, y)
+
+        if self.args.benchmark:
+            self.throughput_calculator(y.shape[0], eval_benchmark=True)
+
+    def eval(self, step):
+        """Evaluate the whole dataset and log the reduced metrics.
+
+        Args:
+            step: tensor-like object whose ``numpy()`` value is the current
+                step, used as the log step.
+
+        Returns:
+            Dict with ``loss_val`` and ``streaming_map_val`` formatted to four
+            decimals when running on CPU or on Horovod rank 0; an empty dict
+            on the other workers.
+        """
+
+        eval_data = {}
+        self._reset_states()
+        # In benchmark mode the dataset is iterated up to 100 times.
+        range_val = 1 if not self.args.benchmark else 100
+
+        # Graph mode part
+        for _ in range(range_val):
+            for x, y in self.eval_dataset:
+                self.eval_step(x, y)
+
+        map_metric, eval_loss = self._reduce_results()
+
+        if self.args.cpu or hvd.rank() == 0:
+            with tf.device("/CPU:0"):
+                # Eager mode part
+                current_step = int(step.numpy())
+                eval_data = {
+                    "loss_val": f"{eval_loss.numpy():.4f}",
+                    "streaming_map_val": f"{map_metric.numpy():.4f}",
+                }
+
+                self.log(eval_data, current_step, self.steps)
+
+        return eval_data
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/gpu_affinity.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/gpu_affinity.py
@ -43,12 +43,12 @@ class device:
        return pynvml.nvmlDeviceGetName(self.handle)

    def getCpuAffinity(self):
-        affinity_string = ''
+        affinity_string = ""
        for j in pynvml.nvmlDeviceGetCpuAffinity(
                self.handle, device._nvml_affinity_elements
        ):
            # assume nvml returns list of 64 bit ints
-            affinity_string = '{:064b}'.format(j) + affinity_string
+            affinity_string = "{:064b}".format(j) + affinity_string
        affinity_list = [int(x) for x in affinity_string]
        affinity_list.reverse()  # so core 0 is in 0th element of list

@ -77,7 +77,9 @@ def set_single_unique_affinity(gpu_id, nproc_per_node):

    # remove siblings
    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
+        socket_affinities[idx] = list(
+            set(socket_affinity) - set(siblings_dict.values())
+        )

    affinities = []
    assigned = []
@ -100,7 +102,9 @@ def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):

    # remove siblings
    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
+        socket_affinities[idx] = list(
+            set(socket_affinity) - set(siblings_dict.values())
+        )

    socket_affinities_to_device_ids = collections.defaultdict(list)

@ -112,22 +116,26 @@ def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
        cores_per_device = len(socket_affinity) // devices_per_group
        for group_id, device_id in enumerate(device_ids):
            if device_id == gpu_id:
-                if mode == 'interleaved':
+                if mode == "interleaved":
                    affinity = list(socket_affinity[group_id::devices_per_group])
-                elif mode == 'continuous':
-                    affinity = list(socket_affinity[group_id * cores_per_device:(group_id + 1) * cores_per_device])
+                elif mode == "continuous":
+                    affinity = list(
+                        socket_affinity[group_id * cores_per_device:(group_id + 1) * cores_per_device]
+                    )
                else:
-                    raise RuntimeError('Unknown set_socket_unique_affinity mode')
+                    raise RuntimeError("Unknown set_socket_unique_affinity mode")

                # reintroduce siblings
-                affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict]
+                affinity += [
+                    siblings_dict[aff] for aff in affinity if aff in siblings_dict
+                ]
                os.sched_setaffinity(0, affinity)


 def get_thread_siblings_list():
-    path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list'
+    path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
    thread_siblings_list = []
-    pattern = re.compile(r'(\d+)\D(\d+)')
+    pattern = re.compile(r"(\d+)\D(\d+)")
    for fname in pathlib.Path(path[0]).glob(path[1:]):
        with open(fname) as f:
            content = f.read().strip()
@ -138,19 +146,19 @@ def get_thread_siblings_list():
    return thread_siblings_list


-def set_affinity(gpu_id, nproc_per_node, mode='socket'):
-    if mode == 'socket':
+def set_affinity(gpu_id, nproc_per_node, mode="socket"):
+    """Apply the CPU affinity policy selected by ``mode`` to this process.
+
+    Args:
+        gpu_id: identifier of the GPU, forwarded to the policy helper.
+        nproc_per_node: forwarded to the ``*_unique`` policy helpers.
+        mode: one of ``socket``, ``single``, ``single_unique``,
+            ``socket_unique_interleaved`` or ``socket_unique_continuous``.
+
+    Returns:
+        The affinity reported by ``os.sched_getaffinity(0)`` afterwards.
+
+    Raises:
+        RuntimeError: if ``mode`` is not one of the values above.
+    """
+    if mode == "socket":
        set_socket_affinity(gpu_id)
-    elif mode == 'single':
+    elif mode == "single":
        set_single_affinity(gpu_id)
-    elif mode == 'single_unique':
+    elif mode == "single_unique":
        set_single_unique_affinity(gpu_id, nproc_per_node)
-    elif mode == 'socket_unique_interleaved':
-        set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved')
-    elif mode == 'socket_unique_continuous':
-        set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous')
+    elif mode == "socket_unique_interleaved":
+        set_socket_unique_affinity(gpu_id, nproc_per_node, "interleaved")
+    elif mode == "socket_unique_continuous":
+        set_socket_unique_affinity(gpu_id, nproc_per_node, "continuous")
    else:
-        raise RuntimeError('Unknown affinity mode')
+        raise RuntimeError("Unknown affinity mode")

+    # Read the affinity back from the OS rather than returning the requested set.
    affinity = os.sched_getaffinity(0)
    return affinity
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/schedulers.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/schedulers.py
@ -12,29 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from tensorflow.python.keras import backend as K
+import tensorflow as tf


-def get_schedule(args, steps_per_epoch):
-    assert args.deep_warmup_epochs <= args.num_epochs, 'Number of warmup epochs cannot be higher than training epochs'
-    base_lr = args.deep_learning_rate
-    warmup_steps = args.deep_warmup_epochs * steps_per_epoch
-    bound_epoch = args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
-    boundaries = [bound_epoch * steps_per_epoch]
-    values = [base_lr / 4, base_lr / 8]
+class LearningRateScheduler:
+    """Warm-up plus step-decay schedule written directly into ``optimizer.lr``.
+
+    While ``step < warmup_steps`` the rate is
+    ``base_lr * step / warmup_steps``. Afterwards it is ``base_lr / 4`` until
+    ``step`` exceeds the single boundary, and ``base_lr / 8`` past it.
+
+    The constructor asserts that ``args.deep_warmup_epochs`` does not exceed
+    ``args.num_epochs`` (the check disappears when Python runs with ``-O``).
+    """
+
+    def __init__(self, args, steps_per_epoch, optimizer):
+        assert (
+            args.deep_warmup_epochs <= args.num_epochs
+        ), "Number of warmup epochs cannot be higher than training epochs"
+        self.base_lr = args.deep_learning_rate
+        self.warmup_steps = args.deep_warmup_epochs * steps_per_epoch
+        # The boundary epoch is the midpoint between the end of warm-up and
+        # the last epoch; it is converted to a step count below.
+        bound_epoch = (
+            args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
+        )
+        self.boundaries = [bound_epoch * steps_per_epoch]
+        # values[0] is used up to and including the boundary, values[1] after.
+        self.values = [self.base_lr / 4, self.base_lr / 8]
+        self.optimizer = optimizer

-    def schedule(optimizer, current_step):
-        current_step = max(1, current_step)
-
-        if current_step < warmup_steps:
-            warmup_lr = base_lr * current_step / warmup_steps
-            K.set_value(optimizer.lr, K.get_value(warmup_lr))
+    @tf.function
+    def __call__(self, step):
+        # Assigns the rate for `step` to self.optimizer.lr; nothing is returned.
+        # NOTE(review): assumes `step` is a scalar float tensor, as passed by
+        # Trainer._execute_step_calculations -- confirm for other callers.
+        if step < self.warmup_steps:
+            warmup_lr = self.base_lr * step / self.warmup_steps
+            self.optimizer.lr.assign(warmup_lr)
        else:
-            for index, bound in enumerate(boundaries):
-                if current_step <= bound:
-                    K.set_value(optimizer.lr, K.get_value(values[index]))
-                    return
-            K.set_value(optimizer.lr, K.get_value(values[-1]))
-        return
-
-    return schedule
+            # The number of boundaries already passed is the index into
+            # `values`: 0 while step <= boundary, 1 once step > boundary.
+            index = tf.reduce_sum(tf.cast(step > self.boundaries, tf.int64))
+            value = tf.gather(self.values, index)
+            self.optimizer.lr.assign(value)
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/setup.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/setup.py
@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import glob
 import json
 import logging
 import os
@ -19,46 +20,33 @@ import os
 import dllogger
 import horovod.tensorflow.keras as hvd
 import tensorflow as tf
-import tensorflow_transform as tft
 from data.outbrain.dataloader import train_input_fn, eval_input_fn
-from data.outbrain.features import PREBATCH_SIZE
 from trainer.utils.gpu_affinity import set_affinity


 def init_cpu(args, logger):
+    """Set up logging for a CPU run, then abort: CPU execution is rejected.
+
+    Sets ``CUDA_VISIBLE_DEVICES`` to "-1", initialises the loggers with
+    ``full=True``, emits a warning and then raises unconditionally.
+
+    Raises:
+        RuntimeError: always, after the warning has been logged.
+    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

-    init_logger(
-        full=True,
-        args=args,
-        logger=logger
-    )
+    init_logger(full=True, args=args, logger=logger)

-    logger.warning('--gpu flag not set, running computation on CPU')
+    logger.warning("--gpu flag not set, running computation on CPU")
+
+    # Raised after logging so the reason is recorded before the process stops.
+    raise RuntimeError("CPU not supported with nvTabular dataloader")


 def init_gpu(args, logger):
+    """Initialise Horovod, logging, CPU affinity and TF execution options.
+
+    Args:
+        args: parsed command-line namespace; ``affinity``, ``amp`` and ``xla``
+            are read here and the namespace is forwarded to ``init_logger``.
+        logger: logger forwarded to ``init_logger`` and used for the
+            affinity message.
+    """
    hvd.init()

-    init_logger(
-        full=hvd.rank() == 0,
-        args=args,
-        logger=logger
-    )
-    if args.affinity != 'disabled':
+    # Only global rank 0 gets the full logging set-up.
+    init_logger(full=hvd.rank() == 0, args=args, logger=logger)
+    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        affinity = set_affinity(
-            gpu_id=gpu_id,
-            nproc_per_node=hvd.size(),
-            mode=args.affinity
+            # gpu_id is node-local, so the process count must be node-local
+            # too: hvd.size() is the global count and overstates the number
+            # of processes per node on multi-node runs.
+            gpu_id=gpu_id, nproc_per_node=hvd.local_size(), mode=args.affinity
        )
-        logger.warning(f'{gpu_id}: thread affinity: {affinity}')
-    gpus = tf.config.experimental.list_physical_devices('GPU')
-    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

    if args.amp:
-        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
-        tf.keras.mixed_precision.experimental.set_policy(policy)
+        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        tf.config.optimizer.set_jit(True)
@ -69,29 +57,36 @@ def init_logger(args, full, logger):
        logger.setLevel(logging.INFO)
        log_path = os.path.join(args.results_dir, args.log_filename)
        os.makedirs(args.results_dir, exist_ok=True)
-        dllogger.init(backends=[
-            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
-                                       filename=log_path),
-            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
-        logger.warning('command line arguments: {}'.format(json.dumps(vars(args))))
+        dllogger.init(
+            backends=[
+                dllogger.JSONStreamBackend(
+                    verbosity=dllogger.Verbosity.VERBOSE, filename=log_path
+                ),
+                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE),
+            ]
+        )
+        logger.warning("command line arguments: {}".format(json.dumps(vars(args))))
        if not os.path.exists(args.results_dir):
            os.mkdir(args.results_dir)

-        with open('{}/args.json'.format(args.results_dir), 'w') as f:
+        with open("{}/args.json".format(args.results_dir), "w") as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])

-    dllogger.log(data=vars(args), step='PARAMETER')
+    dllogger.log(data=vars(args), step="PARAMETER")


 def create_config(args):
-    assert not (args.cpu and args.amp), \
-        'Automatic mixed precision conversion works only with GPU'
-    assert not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps, \
-        'Number of benchmark steps must be higher than warmup steps'
-    logger = logging.getLogger('tensorflow')
+    assert not (
+        args.cpu and args.amp
+    ), "Automatic mixed precision conversion works only with GPU"
+    assert (
+        not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps
+    ), "Number of benchmark steps must be higher than warmup steps"
+
+    logger = logging.getLogger("tensorflow")

    if args.cpu:
        init_cpu(args, logger)
@ -99,36 +94,24 @@ def create_config(args):
        init_gpu(args, logger)

    num_gpus = 1 if args.cpu else hvd.size()
-    gpu_id = 0 if args.cpu else hvd.rank()
    train_batch_size = args.global_batch_size // num_gpus
    eval_batch_size = args.eval_batch_size // num_gpus
-    steps_per_epoch = args.training_set_size / args.global_batch_size

-    feature_spec = tft.TFTransformOutput(
-        args.transformed_metadata_path
-    ).transformed_feature_spec()
+    train_paths = sorted(glob.glob(args.train_data_pattern))
+    valid_paths = sorted(glob.glob(args.eval_data_pattern))

    train_spec_input_fn = train_input_fn(
-        num_gpus=num_gpus,
-        id=gpu_id,
-        filepath_pattern=args.train_data_pattern,
-        feature_spec=feature_spec,
-        records_batch_size=train_batch_size // PREBATCH_SIZE,
+        train_paths=train_paths,
+        records_batch_size=train_batch_size,
    )

    eval_spec_input_fn = eval_input_fn(
-        num_gpus=num_gpus,
-        id=gpu_id,
-        repeat=None if args.benchmark else 1,
-        filepath_pattern=args.eval_data_pattern,
-        feature_spec=feature_spec,
-        records_batch_size=eval_batch_size // PREBATCH_SIZE
+        valid_paths=valid_paths, records_batch_size=eval_batch_size
    )

    config = {
-        'steps_per_epoch': steps_per_epoch,
-        'train_dataset': train_spec_input_fn,
-        'eval_dataset': eval_spec_input_fn
+        "train_dataset": train_spec_input_fn,
+        "eval_dataset": eval_spec_input_fn,
    }

    return config
--- a/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/trainer.py
+++ b/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/trainer.py
@ -0,0 +1,170 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import dllogger
+import horovod.tensorflow as hvd
+import tensorflow as tf
+
+
+class Trainer:
+    """Custom training loop for a model with separate wide and deep parts.
+
+    The model must expose ``linear_model`` and ``dnn_model`` sub-models; their
+    trainable variables are updated by ``wide_optimizer`` and
+    ``deep_optimizer`` respectively. Horovod wraps the gradient tape and is
+    used to broadcast model and optimizer variables from rank 0.
+    """
+
+    def __init__(
+        self,
+        model,
+        scheduler,
+        deep_optimizer,
+        wide_optimizer,
+        throughput_calculator,
+        compiled_loss,
+        steps,
+        args,
+        train_dataset,
+        evaluator,
+    ):
+        # scheduler: called once per step with the 1-based step as float32.
+        # throughput_calculator: called with the batch size in benchmark mode.
+        # compiled_loss: called as compiled_loss(y, y_pred).
+        # steps: only used as the second element of the logged step tuple.
+        # train_dataset: iterated as (x, y) pairs and must support len().
+        # evaluator: its eval(step_variable) result is logged after training.
+        self.model = model
+        self.scheduler = scheduler
+        self.deep_optimizer = deep_optimizer
+        self.wide_optimizer = wide_optimizer
+        self.throughput_calculator = throughput_calculator
+        self.steps = steps
+        self.args = args
+        self.train_dataset = train_dataset
+        self.evaluator = evaluator
+        self.compiled_loss = compiled_loss
+        self.logger = logging.getLogger("tensorflow")
+
+        # Counters are explicitly placed on the CPU device.
+        with tf.device("/CPU:0"):
+            self.current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
+            # Not referenced anywhere else in this class.
+            self.display_id_counter = tf.Variable(
+                0.0, trainable=False, dtype=tf.float64
+            )
+
+        self._init_checkpoint_manager()
+
+    def _init_checkpoint_manager(self):
+        # The checkpoint tracks both optimizers, the model and the step
+        # counter, so a restore also resumes the step count.
+        self.checkpoint = tf.train.Checkpoint(
+            deep_optimizer=self.deep_optimizer,
+            wide_optimizer=self.wide_optimizer,
+            model=self.model,
+            current_step=self.current_step_var,
+        )
+        # Checkpoints live in <model_dir>/checkpoint; only the newest is kept.
+        self.manager = tf.train.CheckpointManager(
+            checkpoint=self.checkpoint,
+            directory=os.path.join(self.args.model_dir, "checkpoint"),
+            max_to_keep=1,
+        )
+
+    def maybe_restore_checkpoint(self):
+        """Restore the latest checkpoint when ``args.use_checkpoint`` is set."""
+        if self.args.use_checkpoint:
+            # restore() is called before latest_checkpoint is checked, so it
+            # may receive None when no checkpoint exists yet.
+            self.checkpoint.restore(self.manager.latest_checkpoint).expect_partial()
+            if self.manager.latest_checkpoint:
+                # The message reports model_dir, not the checkpoint file path.
+                self.logger.warning(
+                    f"Model restored from checkpoint {self.args.model_dir}"
+                )
+                # Benchmark runs keep the restored weights but restart the
+                # step counter from zero.
+                if self.args.benchmark:
+                    self.current_step_var.assign(0)
+            else:
+                self.logger.warning(
+                    f"Failed to restore model from checkpoint {self.args.model_dir}"
+                )
+
+    @tf.function
+    def __call__(self, x, y):
+        """Run one optimisation step on batch ``(x, y)``; return the raw loss."""
+        # persistent=True because the tape is queried twice below, once per
+        # sub-model.
+        with tf.GradientTape(persistent=True) as tape:
+            y_pred = self.model(x, training=True)
+            loss = self.compiled_loss(y, y_pred)
+            # With AMP each optimizer scales the loss itself.
+            # NOTE(review): presumably both are LossScaleOptimizer instances
+            # when args.amp is set -- confirm where they are constructed.
+            linear_loss = (
+                self.wide_optimizer.get_scaled_loss(loss) if self.args.amp else loss
+            )
+            deep_loss = (
+                self.deep_optimizer.get_scaled_loss(loss) if self.args.amp else loss
+            )
+
+        if not self.args.cpu:
+            tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)
+
+        linear_vars = self.model.linear_model.trainable_variables
+        dnn_vars = self.model.dnn_model.trainable_variables
+        linear_grads = tape.gradient(linear_loss, linear_vars)
+        dnn_grads = tape.gradient(deep_loss, dnn_vars)
+        if self.args.amp:
+            linear_grads = self.wide_optimizer.get_unscaled_gradients(linear_grads)
+            dnn_grads = self.deep_optimizer.get_unscaled_gradients(dnn_grads)
+
+        self.wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
+        self.deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))
+
+        # Broadcast rank 0's state while the counter is still 0; this sits
+        # after apply_gradients, so the optimizer variables are included.
+        # When resuming with a non-zero restored step this branch is skipped.
+        # Unlike the tape wrapping above, it is not guarded by args.cpu.
+        if self.current_step_var == 0:
+            hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
+            hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
+            hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
+            hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
+
+        return loss
+
+    @tf.function
+    def _execute_step_calculations(self, x, y):
+        # Order matters: the optimisation step sees the old counter value,
+        # the scheduler gets the 1-based step, and only then is the counter
+        # incremented.
+        loss = self(x, y)
+        with tf.device("/CPU:0"):
+            self.scheduler(tf.cast(self.current_step_var + 1, tf.float32))
+            self.current_step_var.assign_add(1)
+
+        return loss
+
+    def log(self, current_step, loss):
+        # The loss is logged as a string with four decimals.
+        train_data = {"loss": f"{loss:.4f}"}
+        dllogger.log(data=train_data, step=(current_step, self.steps))
+
+    def train_step(self, x, y):
+        """Run one training step, then record throughput or log the loss."""
+
+        # Graph mode part
+        loss = self._execute_step_calculations(x, y)
+
+        # Eager mode part
+        # The counter was already incremented, hence the -1 to get the index
+        # of the step that just ran.
+        current_step = int(self.current_step_var.numpy()) - 1
+        if self.args.benchmark:
+            self.throughput_calculator(y.shape[0])
+        elif (self.args.cpu or hvd.rank() == 0) and current_step % 100 == 0:
+            self.log(current_step, loss.numpy())
+
+    def join_and_broadcast(self):
+        # NOTE(review): hvd.join() presumably lets ranks that ran out of
+        # batches wait for the others -- confirm against the Horovod docs.
+        hvd.join()
+        # Outside benchmark mode every rank is re-synchronised from rank 0.
+        if not self.args.benchmark:
+            hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
+            hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
+            hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
+            hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
+
+    def run_loop(self):
+        """Train to ``args.num_epochs``, evaluating and saving after each epoch."""
+        eval_data = {}
+        # Derive the 1-based starting epoch from the (possibly restored)
+        # step counter.
+        current_epoch = int(self.current_step_var.numpy()) // len(self.train_dataset) + 1
+
+        for _ in range(current_epoch, self.args.num_epochs + 1):
+            # Benchmark mode makes 100 passes over the dataset per epoch.
+            range_val = 1 if not self.args.benchmark else 100
+
+            # Graph mode part
+            for _ in range(range_val):
+                for x, y in self.train_dataset:
+                    self.train_step(x, y)
+                self.join_and_broadcast()
+
+            eval_data = self.evaluator.eval(self.current_step_var)
+
+            # Only rank 0 (or the single CPU process) writes checkpoints.
+            if self.args.cpu or hvd.rank() == 0:
+                self.manager.save()
+
+        # Log the last evaluation result with an empty step tuple; eval_data
+        # stays {} if the epoch loop did not run.
+        if self.args.cpu or hvd.rank() == 0:
+            dllogger.log(data=eval_data, step=tuple())