diff --git a/PyTorch/Forecasting/TFT/Dockerfile b/PyTorch/Forecasting/TFT/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/LICENCE b/PyTorch/Forecasting/TFT/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/PyTorch/Forecasting/TFT/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PyTorch/Forecasting/TFT/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/PyTorch/Forecasting/TFT/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. 
+To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/PyTorch/Forecasting/TFT/NOTICE b/PyTorch/Forecasting/TFT/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/PyTorch/Forecasting/TFT/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/PyTorch/Forecasting/TFT/README.md b/PyTorch/Forecasting/TFT/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/PyTorch/Forecasting/TFT/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. 
The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford. +This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, either by masking missing values in the attention matrices or by embedding them as a special value in the latent space. +This model enables the prediction of confidence intervals for future values of a time series over multiple future timesteps. + +This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +The TFT model is a hybrid architecture joining LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the time series itself.
All variables are embedded into a high-dimensional space by learning an embedding vector. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. The model learns a single vector for each continuous variable, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as context for the variable selection of other variables and as the initial state of the LSTM encoders. +After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts. +For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction. +![](TFT_architecture.PNG) +*image source: https://arxiv.org/abs/1912.09363* + +### Default configuration + +The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply per-time-series scaling, which takes into account shifts in distribution between entities (for example, a factory consumes more electricity than an average house). The model is trained with the quantile (pinball) loss computed for the quantiles [0.1, 0.5, 0.9]. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below. + +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Supported +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method.
+[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. 
+ +**Quantiles** +Cut points dividing the range of a probability distribution into intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container, to set up the required environment or to create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. +```bash +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/PyTorch/Forecasting/TFT +``` + +2. Build the TFT PyTorch NGC container. +```bash +docker build --network=host -t tft . +``` + +3. Start an interactive session in the NGC container to run training/inference. +```bash +docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft +``` + +Note: Ensure that you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container. + +4. Download and preprocess the datasets. +```bash +bash scripts/get_data.sh +``` + +5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. +These scripts are tuned for DGX-1-32G. If you have a different system, use the `NGPU` and `BATCH_SIZE` variables to adjust the parameters for your system. +```bash +bash scripts/run_electricity.sh +bash scripts/run_traffic.sh +``` + +6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
+```bash +python inference.py \ +--checkpoint <path to checkpoint> \ +--data /data/processed/<dataset>/test.csv \ +--cat_encodings /data/processed/<dataset>/cat_encodings.bin \ +--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin +``` + +7. Start inference/predictions. Visualize and save predictions by running the following command. +```bash +python inference.py \ +--checkpoint <path to checkpoint> \ +--data /data/processed/<dataset>/test.csv \ +--cat_encodings /data/processed/<dataset>/cat_encodings.bin \ +--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \ +--visualize \ +--save_predictions +``` + + + +Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. +## Advanced + +The following sections provide more details about the dataset, running training and inference, and the training results. + +### Scripts and sample code + +In the root directory, the most important files are: + +`train.py`: Entry point for training +`data_utils.py`: File containing the dataset implementation and preprocessing functions +`modeling.py`: Definition of the model +`configuration.py`: Contains configuration classes for various experiments +`test.py`: Entry point for testing a trained model +`Dockerfile`: Container definition +`log_helper.py`: Contains helper functions for setting up dllogger +`criterions.py`: Definitions of loss functions + +The `scripts` directory contains scripts for default use cases: +`run_electricity.sh`: Trains the default model on the electricity dataset +`run_traffic.sh`: Trains the default model on the traffic dataset + +### Command-line options + +To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: +`python train.py --help`. + +The following example output is printed when running the model: +``` +usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] + [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] + +optional arguments: + -h, --help show this help message and exit + --data_path DATA_PATH + --dataset {electricity,volatility,traffic,favorita} + --epochs EPOCHS + --sample_data SAMPLE_DATA SAMPLE_DATA + --batch_size BATCH_SIZE + --lr LR + --seed SEED + --use_amp Enable automatic mixed precision + --clip_grad CLIP_GRAD + --early_stopping EARLY_STOPPING + Stop training if validation loss does not improve for more than this number of epochs. + --results RESULTS + --log_file LOG_FILE + --distributed_world_size N + total number of GPUs across all nodes (default: all visible GPUs) + --distributed_rank DISTRIBUTED_RANK + rank of the current worker + --local_rank LOCAL_RANK + rank of the current worker + --overwrite_config OVERWRITE_CONFIG + JSON string used to overload config + +```
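Many of these options map onto fields of the per-dataset configuration classes in `configuration.py`. As a rough, illustrative sketch of the `--overwrite_config` option, the command below overrides selected configuration fields with a JSON string; the field names used here (`hidden_size`, `dropout`) are examples only and must match attributes actually defined by the configuration class chosen for your dataset:

```bash
# Illustrative only: override selected configuration fields for a single run.
# The JSON keys must correspond to attributes of the chosen dataset's
# configuration class in configuration.py.
python train.py \
  --dataset electricity \
  --data_path /data/processed/electricity_bin \
  --results /results/TFT_electricity_overrides \
  --overwrite_config '{"hidden_size": 160, "dropout": 0.2}'
```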
### Getting the data + +The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers. +#### Dataset guidelines + +The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into the common format that contains the features we will use for training. Then the standardized data is cleaned, normalized, encoded, and binarized. +This step does the following: +- Drop all the columns that are not marked in the configuration file as used for training or preprocessing +- Flatten indices in case time series are indexed by more than one column +- Split the data into training, validation, and test splits +- Filter out all the time series shorter than the minimal example length +- Normalize columns marked as continuous in the configuration file +- Encode as integers columns marked as categorical +- Save the data in CSV and binary formats + +#### Multi-dataset +In order to use an alternate dataset, you have to write a function that parses your data into a common format. The format is as follows: +- There is at least one id column +- There is exactly one time column (that can also be used as a feature column) +- Each feature is in a separate column +- Each row represents a moment in time for only one time series + +Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
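For intuition, a minimal sketch of data in this format is shown below. The column names are arbitrary examples (the actual mapping of columns to roles is declared in the dataset configuration), so treat this only as an illustration of the expected shape, not of a real schema:

```python
# Illustrative sketch of the expected "long" data format: one id column,
# one time column, and one column per feature, with each row describing a
# single time series at a single point in time.
import pandas as pd

toy = pd.DataFrame({
    "series_id":  ["A", "A", "A", "B", "B", "B"],   # id column
    "timestamp":  [0, 1, 2, 0, 1, 2],               # single time column
    "observed_y": [0.7, 0.9, 1.1, 5.0, 4.8, 5.2],   # observed (target) feature
    "known_hour": [0, 1, 2, 0, 1, 2],               # known-in-advance feature
    "static_cat": ["x", "x", "x", "y", "y", "y"],   # static feature
})
print(toy)
```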
### Training process + +The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory. +The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early when the validation loss has not improved for N consecutive epochs. +The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing the quantile loss. After training, the checkpoint with the lowest validation loss is evaluated on the test split with the q-risk metric. +Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint that had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running inference. Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, the script evaluates the model on the specified dataset and prints the q-risk computed on that dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in CSV format. The `--joint_visualization` option plots graphs in TensorBoard format, which lets us inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
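The accuracy entries in the tables above report three values each: the per-quantile q-risk for the 0.1, 0.5, and 0.9 quantiles. For readers unfamiliar with the metric, the sketch below restates the quantile (pinball) loss and the normalized q-risk in the form given in the TFT paper; it is only an illustration, and the loss functions actually used by this repository are defined in `criterions.py`:

```python
# Illustrative NumPy sketch of the quantile (pinball) loss and the q-risk
# metric from the TFT paper; not the code used by this repository.
import numpy as np

def quantile_loss(y_true, y_pred, q):
    """Elementwise pinball loss for a single quantile q."""
    diff = y_true - y_pred
    return np.maximum(q * diff, (q - 1.0) * diff)

def q_risk(y_true, y_pred, q):
    """Summed pinball loss normalized by the total target magnitude."""
    return 2.0 * quantile_loss(y_true, y_pred, q).sum() / np.abs(y_true).sum()

# Toy example for the 0.9 quantile
y_true = np.array([10.0, 12.0, 9.0])
y_pred = np.array([11.0, 11.5, 9.5])
print(q_risk(y_true, y_pred, 0.9))
```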
+ +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/PyTorch/Forecasting/TFT/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/PyTorch/Forecasting/TFT/TFT_architecture.PNG differ diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. 
+You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. 
+This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
+The model predicts confidence intervals for future values of a time series over multiple future timesteps.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+### Model architecture
+
+The TFT model is a hybrid architecture that joins LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, we feed the model with historical values of the time series itself. All variables are embedded in a high-dimensional space by learning an embedding vector for each of them. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs according to their relevance to the prediction. Static variables are used as a context for the variable selection of the other variables and as the initial state of the LSTM encoders.
+After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping parts of the network.
+For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
+![](TFT_architecture.PNG)
+*image source: https://arxiv.org/abs/1912.09363*
+
+### Default configuration
+
+The specific configuration of the TFT model depends on the dataset used. Not only the size of the model but also the data sampling and preprocessing strategies are subject to change. During preprocessing, data is normalized per feature. For some of the datasets, we additionally apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss `QL(y, y_hat, q) = q * max(y - y_hat, 0) + (1 - q) * max(y_hat - y, 0)`, summed over the quantiles `q` in [0.1, 0.5, 0.9] (see `criterions.py` for the exact implementation). The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
+
+| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
+| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
+
+### Feature support matrix
+
+The following features are supported by this model:
+
+| Feature | TFT |
+|----------------------------|--------------------------|
+| Distributed data parallel | Yes |
+| PyTorch AMP | Yes |
+
+#### Features
+
+[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) provides an easy way to leverage Tensor Cores' performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
+
+[PyTorch DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module wrapper that enables easy multiprocess distributed data-parallel training.
+
+### Mixed precision training
+
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training previously required two steps:
+
+1. Porting the model to use the FP16 data type where appropriate.
+2. Manually adding loss scaling to preserve small gradient values.
+
+The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
+
+For information about:
+* How to train using mixed precision, refer to the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
+* Techniques used for mixed precision training, refer to the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
+* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog post.
+
+#### Enabling mixed precision
+
+Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision `torch.cuda.amp` module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the `GradScaler` class.
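+The snippet below is a minimal, generic illustration of this `autocast`/`GradScaler` pattern; it is not the exact training loop used in `train.py`, and the toy model, loss, and random data are placeholders only.
+
+```python
+# Generic torch.cuda.amp sketch: forward pass under autocast, scaled backward pass.
+import torch
+
+model = torch.nn.Linear(64, 3).cuda()                 # stand-in for the TFT model
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+criterion = torch.nn.L1Loss()                         # stand-in for the quantile loss
+scaler = torch.cuda.amp.GradScaler()
+
+for _ in range(10):
+    x = torch.randn(32, 64, device='cuda')
+    y = torch.randn(32, 3, device='cuda')
+    optimizer.zero_grad()
+    with torch.cuda.amp.autocast():                   # run the forward pass in mixed precision
+        loss = criterion(model(x), y)
+    scaler.scale(loss).backward()                     # scale the loss to preserve small gradients
+    scaler.step(optimizer)                            # unscale gradients and apply the update
+    scaler.update()                                   # adjust the loss scale for the next iteration
+```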
+All the necessary steps to implement AMP are described in detail [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples).
+
+To enable mixed precision for TFT, simply add the `--use_amp` option to the training script.
+
+#### Enabling TF32
+
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
+
+### Glossary
+
+**Multi-horizon prediction**
+Process of estimating values of a time series for multiple future time steps.
+
+**Quantiles**
+Cut points dividing the range of a probability distribution into intervals with equal probabilities.
+
+**Time series**
+Series of data points indexed and equally spaced in time.
+
+**Transformer**
+The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
+
+## Setup
+
+The following section lists the requirements that you need to meet in order to start training the TFT model.
+
+### Requirements
+
+This repository contains a Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+- Supported GPUs:
+  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
+  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
+
+For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
+- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
+
+For those unable to use the PyTorch NGC container to set up the required environment or to create their own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+## Quick Start Guide
+
+To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
+
+1. Clone the repository.
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/Forecasting/TFT
+```
+
+2. Build the TFT PyTorch NGC container.
+```bash
+docker build --network=host -t tft .
+```
+
+3. Start an interactive session in the NGC container to run training/inference.
+```bash
+docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
+```
+
+Note: Ensure you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
+
+4. Download and preprocess the datasets.
+```bash
+bash scripts/get_data.sh
+```
+
+5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
+These scripts are tuned for DGX1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system.
+```bash
+bash scripts/run_electricity.sh
+bash scripts/run_traffic.sh
+```
+
+6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
+```
+
+7. Start inference/predictions. Visualize and save predictions by running the following command.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
+--visualize \
+--save_predictions
+```
+
+Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+
+## Advanced
+
+The following sections provide more details about the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+In the root directory, the most important files are:
+
+- `train.py`: Entry point for training
+- `data_utils.py`: File containing the dataset implementation and preprocessing functions
+- `modeling.py`: Definition of the model
+- `configuration.py`: Contains configuration classes for various experiments
+- `test.py`: Entry point for testing a trained model
+- `Dockerfile`: Container definition
+- `log_helper.py`: Contains helper functions for setting up dllogger
+- `criterions.py`: Definitions of loss functions
+
+The `scripts` directory contains scripts for default use cases:
+- `run_electricity.sh`: trains the default model on the electricity dataset
+- `run_traffic.sh`: trains the default model on the traffic dataset
+
+### Command-line options
+
+To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+`python train.py --help`.
+
+The following example output is printed when running `python train.py --help`:
+```
+usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
+                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --data_path DATA_PATH
+  --dataset {electricity,volatility,traffic,favorita}
+  --epochs EPOCHS
+  --sample_data SAMPLE_DATA SAMPLE_DATA
+  --batch_size BATCH_SIZE
+  --lr LR
+  --seed SEED
+  --use_amp             Enable automatic mixed precision
+  --clip_grad CLIP_GRAD
+  --early_stopping EARLY_STOPPING
+                        Stop training if validation loss does not improve for more than this number of epochs.
+  --results RESULTS
+  --log_file LOG_FILE
+  --distributed_world_size N
+                        total number of GPUs across all nodes (default: all visible GPUs)
+  --distributed_rank DISTRIBUTED_RANK
+                        rank of the current worker
+  --local_rank LOCAL_RANK
+                        rank of the current worker
+  --overwrite_config OVERWRITE_CONFIG
+                        JSON string used to overload config
+```
+
+### Getting the data
+
+The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for the electricity and traffic datasets automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers.
+
+#### Dataset guidelines
+
+The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed to a common format that contains the features we will use for training. The standardized data is then cleaned, normalized, encoded, and binarized.
+This step does the following:
+- Drop all the columns that are not marked in the configuration file as used for training or preprocessing
+- Flatten indices in case time series are indexed by more than one column
+- Split the data into training, validation, and test splits
+- Filter out all the time series shorter than the minimal example length
+- Normalize columns marked as continuous in the configuration file
+- Encode as integers columns marked as categorical
+- Save the data in CSV and binary formats
+
+#### Multi-dataset
+
+In order to use an alternate dataset, you have to write a function that parses your data to a common format (see the sketch below). The format is as follows:
+- There is at least one id column
+- There is exactly one time column (that can also be used as a feature column)
+- Each feature is in a separate column
+- Each row represents a moment in time for only one time series
+
+Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
+
+### Training process
+
+The `train.py` script is the entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
+The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends if the validation loss does not improve for N subsequent epochs.
+The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
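+As an illustration of the common format described in the [Multi-dataset](#multi-dataset) section above, the following is a minimal sketch of a parsing function for a hypothetical wide CSV with one column per series and one row per timestamp. The file and column names are illustrative only; the features you derive must match the `FeatureSpec` entries of your configuration in `configuration.py`.
+
+```python
+# Hypothetical parser: wide CSV (rows = timestamps, columns = series) -> long format
+# with one id column, one time column, and one row per (series, timestep).
+import pandas as pd
+
+def parse_my_dataset(src_path):
+    wide = pd.read_csv(src_path, index_col=0, parse_dates=True)
+    wide.index.name = 'timestamp'
+    earliest = wide.index.min()
+
+    # Long format: each row describes exactly one time series at one moment in time.
+    df = wide.reset_index().melt(id_vars='timestamp', var_name='id', value_name='target')
+
+    df['hours_from_start'] = (df['timestamp'] - earliest) / pd.Timedelta(hours=1)  # single time column
+    df['day_of_week'] = df['timestamp'].dt.dayofweek                               # known covariate
+    df['categorical_id'] = df['id']                                                # static copy of the id
+    return df.drop(columns='timestamp')
+
+# Example usage (hypothetical file):
+# parse_my_dataset('my_data.csv').to_csv('my_dataset_standarized.csv')
+```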
+You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
+
+Example command:
+```
+python -m torch.distributed.launch --nproc_per_node=8 train.py \
+        --dataset electricity \
+        --data_path /data/processed/electricity_bin \
+        --batch_size=1024 \
+        --sample 450000 50000 \
+        --lr 1e-3 \
+        --epochs 25 \
+        --early_stopping 5 \
+        --seed 1 \
+        --use_amp \
+        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
+```
+
+The model is trained by optimizing the quantile loss described in the [Default configuration](#default-configuration) section. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric, a normalized quantile loss.
+Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs.
+
+### Inference process
+
+Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running the inference. Example command:
+```
+python inference.py \
+--checkpoint /results/checkpoint.pt \
+--data /data/processed/electricity_bin/test.csv \
+--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
+--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
+--batch_size 2048 \
+--visualize \
+--save_predictions \
+--joint_visualization \
+--results /results \
+--use_amp
+```
+
+In the default setting, it performs the evaluation of the model on the specified dataset and prints the q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in CSV format. The `--joint_visualization` option allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file.
+
+## Performance
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+#### Training performance benchmark
+
+In order to run training benchmarks, use the `scripts/benchmark.sh` script.
+
+#### Inference performance benchmark
+
+To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
+
+### Results
+
+The following sections provide details on how we achieved our performance and accuracy in training and inference.
+
+#### Training accuracy results
+
+We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from hundreds of runs.
+
+##### Training accuracy: NVIDIA DGX A100 (A100 80GB)
+
+Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
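+The speedup and weak-scaling columns in the tables above are derived directly from the raw throughput numbers; for example, for the Electricity rows of the DGX A100 table:
+
+```python
+# Sanity check of the derived columns (DGX A100, Electricity rows above).
+tf32_1gpu, amp_1gpu = 10173, 13703      # items/s on 1 GPU
+tf32_8gpu, amp_8gpu = 80596, 107761     # items/s on 8 GPUs
+
+print(f"Mixed precision speedup (1 GPU): {amp_1gpu / tf32_1gpu:.2f}x")   # ~1.35x
+print(f"Weak scaling - TF32:             {tf32_8gpu / tf32_1gpu:.2f}x")  # ~7.92x
+print(f"Weak scaling - mixed precision:  {amp_8gpu / amp_1gpu:.2f}x")    # ~7.86x
+```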
+ + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG differ diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + 
self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. 
Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. + # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == 
InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
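Earlier in standarize_traffic, reverse_shuffle_order is built with a dict comprehension that inverts the 0-indexed randperm permutation. For a permutation array, np.argsort produces the same inverse; a toy check with illustrative values:

    import numpy as np

    shuffle_order = np.array([2, 0, 3, 1])             # new position of each original row
    reverse_shuffle_order = np.argsort(shuffle_order)  # original row for each new position
    # Applying the inverse restores the identity ordering.
    assert (shuffle_order[reverse_shuffle_order] == np.arange(4)).all()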
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
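The previous behaviour described in the comment above, extending the oil series with every date present in dates and forward-filling before any join, can be sketched as follows. This only illustrates that described alternative, not the code path taken below:

    # Extend the oil series over all dates seen in `temporal`, then forward-fill.
    # Illustration of the older behaviour discussed above; the code below joins
    # first and fills afterwards instead.
    oil_extended = oil.reindex(oil.index.union(pd.to_datetime(dates))).fillna(method='ffill')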
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
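The ModelEma wrapper defined in ema.py above is normally updated once per optimizer step and queried with the averaged weights at evaluation time. A hypothetical wiring sketch; the model, optimizer, criterion and loaders are placeholders, and only the ModelEma calls mirror the class above:

    import torch
    from ema import ModelEma

    model = build_model().cuda()             # placeholder model constructor
    ema = ModelEma(model, decay=0.999)       # shadow copy kept in eval mode
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for batch, target in train_loader:       # placeholder dataloader
        loss = criterion(model(batch), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema.update(model)                    # fold the new weights into the EMA

    # Evaluate with the averaged weights, either via ema(x) or ema.module.
    with torch.no_grad():
        val_out = ema(val_batch)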
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = 
torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
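For reference, the q_risk value computed in inference() above is the normalized quantile risk from the TFT paper (https://arxiv.org/abs/1912.09363). Assuming QuantileLoss applies a mean reduction, 2 * losses / normalizer equals the ratio of sums:

    q-Risk(q) = 2 * sum_t QL(y_t, yhat_t(q), q) / sum_t |y_t|
    QL(y, yhat, q) = q * max(0, y - yhat) + (1 - q) * max(0, yhat - y)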
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
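main() in inference.py above expects the checkpoint file to be a dict carrying both the configuration and the weights. A sketch of the matching writer side; the training script presumably does something equivalent, and the file name is only an example:

    # Hypothetical checkpoint writer matching what inference.py reads back:
    # state_dict['config'] and state_dict['model'].
    torch.save({'config': config, 'model': model.state_dict()}, 'checkpoint.pt')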
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git 
a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. 
Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + 
self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = 
F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + 
enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
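modeling.py above defines the complete network; the following standalone smoke test sketches how it can be instantiated and run on dummy inputs. Every config value here is illustrative (the real values come from configuration.py, e.g. ElectricityConfig), TFT_SCRIPTING swaps the Apex FusedLayerNorm for torch.nn.LayerNorm exactly as in the import guard above, and a GPU is required because forward() calls torch.cuda.synchronize():

    import os
    os.environ['TFT_SCRIPTING'] = '1'   # fall back to torch.nn.LayerNorm (no Apex needed)
    from types import SimpleNamespace
    import torch
    from modeling import TemporalFusionTransformer

    config = SimpleNamespace(
        static_categorical_inp_lens=[10],           # one static categorical, cardinality 10
        temporal_known_categorical_inp_lens=[24],   # e.g. hour of day
        temporal_observed_categorical_inp_lens=[],
        static_continuous_inp_size=0,
        temporal_known_continuous_inp_size=1,
        temporal_observed_continuous_inp_size=0,
        temporal_target_size=1,
        hidden_size=16, n_head=4, dropout=0.1, attn_dropout=0.0,
        encoder_length=6, example_length=8,
        num_static_vars=1,    # static categoricals + static continuous
        num_future_vars=2,    # known categoricals + known continuous
        num_historic_vars=3,  # known + observed inputs + target
        quantiles=[0.1, 0.5, 0.9],
    )

    model = TemporalFusionTransformer(config).cuda()
    batch = {
        's_cat': torch.randint(10, (2, 8, 1)).cuda(),
        'k_cat': torch.randint(24, (2, 8, 1)).cuda(),
        'k_cont': torch.randn(2, 8, 1).cuda(),
        'target': torch.randn(2, 8, 1).cuda(),
    }
    out = model(batch)
    print(out.shape)  # (2, example_length - encoder_length, len(quantiles)) == (2, 2, 3)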
+ +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
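scripts/get_data.sh above drives dataset preparation through two python -c one-liners per dataset. For electricity they are equivalent to the following, with paths mirroring the script's defaults:

    from data_utils import standarize_electricity, preprocess
    from configuration import ElectricityConfig

    # Rewrite the raw download into the common standarized.csv layout,
    # then produce the binary training splits under /data/processed.
    standarize_electricity('/data/raw/electricity')
    preprocess('/data/raw/electricity/standarized.csv',
               '/data/processed/electricity_bin',
               ElectricityConfig())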
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
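+# As in the electricity scripts, SEED, LR, NGPU, BATCH_SIZE and EPOCHS below are
+# environment-overridable defaults; the traffic recipe trains for 20 epochs by default.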
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. 
+You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. 
+This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, by either masking their values in attention matrices or embedding them as a special value in the latent space.
+This model enables the prediction of confidence intervals for future values of time series for multiple future timesteps.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+### Model architecture
+
+The TFT model is a hybrid architecture joining LSTM encoding of time series and the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the past values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector for each of them. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of the other variables and as an initial state of the LSTM encoders.
+After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts.
+For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
+![](TFT_architecture.PNG)
+*image source: https://arxiv.org/abs/1912.09363*
+
+### Default configuration
+
+The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss QL(y, ŷ, q) = q·max(y − ŷ, 0) + (1 − q)·max(ŷ − y, 0), as implemented in `criterions.py`, for the quantiles q in [0.1, 0.5, 0.9]. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
+ +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Yes column +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. +[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. 
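+The overall pattern looks roughly like the sketch below. This is a minimal illustration of the `torch.cuda.amp` API with a dummy model and batch, not the exact loop implemented in `train.py`:
+
+```python
+import torch
+from torch import nn
+from torch.cuda import amp
+
+model = nn.Linear(10, 1).cuda()                    # stand-in for the real model
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+scaler = amp.GradScaler()                          # applies dynamic loss scaling automatically
+
+data = torch.randn(32, 10, device='cuda')          # dummy batch
+target = torch.randn(32, 1, device='cuda')
+
+optimizer.zero_grad()
+with amp.autocast():                               # forward pass runs in mixed precision
+    loss = nn.functional.mse_loss(model(data), target)
+scaler.scale(loss).backward()                      # scale the loss to avoid FP16 gradient underflow
+scaler.step(optimizer)                             # unscales gradients, then runs the optimizer step
+scaler.update()                                    # adjusts the loss scale for the next iteration
+```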
All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. + +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. 
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/Forecasting/TFT
+```
+
+2. Build the TFT PyTorch NGC container.
+```bash
+docker build --network=host -t tft .
+```
+
+3. Start an interactive session in the NGC container to run training/inference.
+```bash
+docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
+```
+
+Note: Ensure that you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
+
+4. Download and preprocess the datasets.
+```bash
+bash scripts/get_data.sh
+```
+
+5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
+These scripts are tuned for DGX1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system.
+```bash
+bash scripts/run_electricity.sh
+bash scripts/run_traffic.sh
+```
+
+6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
+```
+
+7. Start inference/predictions. Visualize and save predictions by running the following command.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
+--visualize \
+--save_predictions
+```
+
+Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+## Advanced
+
+The following sections provide more details about the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+In the root directory, the most important files are:
+
+`train.py`: Entry point for training
+`data_utils.py`: File containing the dataset implementation and preprocessing functions
+`modeling.py`: Definition of the model
+`configuration.py`: Contains configuration classes for various experiments
+`test.py`: Entry point for testing a trained model
+`Dockerfile`: Container definition
+`log_helper.py`: Contains helper functions for setting up dllogger
+`criterions.py`: Definitions of loss functions
+
+The `scripts` directory contains scripts for the default use cases:
+`run_electricity.sh`: trains the default model on the electricity dataset
+`run_traffic.sh`: trains the default model on the traffic dataset
+
+### Command-line options
+
+To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+`python train.py --help`.
+
+The following example output is printed when running the model:
+```
+usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
+                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --data_path DATA_PATH
+  --dataset {electricity,volatility,traffic,favorita}
+  --epochs EPOCHS
+  --sample_data SAMPLE_DATA SAMPLE_DATA
+  --batch_size BATCH_SIZE
+  --lr LR
+  --seed SEED
+  --use_amp             Enable automatic mixed precision
+  --clip_grad CLIP_GRAD
+  --early_stopping EARLY_STOPPING
+                        Stop training if validation loss does not improve for more than this number of epochs.
+  --results RESULTS
+  --log_file LOG_FILE
+  --distributed_world_size N
+                        total number of GPUs across all nodes (default: all visible GPUs)
+  --distributed_rank DISTRIBUTED_RANK
+                        rank of the current worker
+  --local_rank LOCAL_RANK
+                        rank of the current worker
+  --overwrite_config OVERWRITE_CONFIG
+                        JSON string used to overload config
+
+```
+
+### Getting the data
+
+The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for the electricity and traffic datasets automatically downloads and preprocesses the training, validation and test datasets, and produces files that contain the scalers.
+#### Dataset guidelines
+
+The `data_utils.py` file contains all functions that are used to preprocess the data. Initially the data is loaded into a `pandas.DataFrame` and parsed into the common format which contains the features we will use for training. Then the standardized data is cleaned, normalized, encoded and binarized.
+This step does the following:
+- Drop all the columns that are not marked in the configuration file as used for training or preprocessing
+- Flatten indices in case time series are indexed by more than one column
+- Split the data into training, validation and test splits
+- Filter out all the time series shorter than the minimal example length
+- Normalize columns marked as continuous in the configuration file
+- Encode as integers columns marked as categorical
+- Save the data in csv and binary formats
+
+#### Multi-dataset
+In order to use an alternate dataset, you have to write a function that parses your data into the common format. The format is as follows:
+- There is at least one id column
+- There is exactly one time column (that can also be used as a feature column)
+- Each feature is in a separate column
+- Each row represents a moment in time for only one time series
+Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
+### Training process
+
+The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
+The model trains for at most `--epochs` epochs. If the option `--early_stopping N` is set, then training will end if the validation loss has not improved for N subsequent epochs.
+The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
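+For illustration, the lookup can be sketched as follows (run inside the container, where `configuration.py` is on the path; the printed values correspond to the electricity configuration defined in this repository):
+
+```python
+from configuration import CONFIGS      # {'electricity': ElectricityConfig, 'traffic': TrafficConfig}
+
+config = CONFIGS['electricity']()      # roughly what `--dataset electricity` resolves to
+print(config.encoder_length)           # 168 -> one week (7 * 24) of historical steps
+print(config.example_length)           # 192 -> history plus a 24-step forecast horizon
+print(config.quantiles)                # [0.1, 0.5, 0.9]
+```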
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
+ + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG differ diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
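+# Dataset-specific TFT configurations. The CONFIGS dictionary at the bottom of this
+# file maps the dataset names supported by this repository ('electricity', 'traffic')
+# to their configuration classes.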
+ +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + 
self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. 
Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. + # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == 
InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
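+    # Note on the storage format below: .astype(np.float32).view(dtype=np.int32) does not
+    # round-trip values through int; it only reinterprets the float32 bit patterns as int32
+    # so categorical and continuous columns fit in a single homogeneous array per group.
+    # TFTBinaryDataset undoes this with .view(dtype=np.float32) and then casts each feature
+    # group to its proper dtype via DTYPE_MAP.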
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
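+    # For each customer, assemble an hourly frame holding the power reading together with
+    # calendar covariates (hour, day, day_of_week, month) derived from the timestamp index.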
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
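+    # The categorical_* columns below duplicate existing fields, presumably so the same
+    # information can also be embedded as categorical inputs (mirrors standarize_electricity).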
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
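+    # In short: left-join oil prices restricted to dates present in the sales data,
+    # forward-fill whatever gaps remain after the join, and substitute -1 for values
+    # that are still missing.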
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
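+
+# Helpers to pin each training process to CPU cores that NVML reports as local to its GPU.
+# A minimal usage sketch (local_rank and nproc_per_node are illustrative names, not defined
+# in this module):
+#
+#     import gpu_affinity
+#     affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node, mode='socket_unique_continuous')
+#     print(f'rank {local_rank}: pinned to {len(affinity)} CPU cores')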
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = 
torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
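+        # Writes one PDF per example under <results>/single_example_vis/<timeseries id>/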
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
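+
+# dllogger setup: JSON file, stdout and TensorBoard backends plus per-metric metadata.
+# setup_logger(args) expects an argparse Namespace providing at least the results and
+# log_file attributes; when torch.distributed is initialized it also reads
+# distributed_world_size and distributed_rank so that only rank 0 writes logs.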
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git 
a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. 
Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + 
self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = 
F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + 
enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
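+
+# Usage sketch (illustrative): the ': ${VAR:=default}' expansions below only assign
+# a value when the variable is unset, so hyperparameters can be overridden from the
+# environment, e.g. (assuming the script is invoked from the model root directory):
+#
+#   NGPU=1 BATCH_SIZE=512 EPOCHS=5 bash scripts/run_electricity_DGX1-16G.sh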
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
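+
+# Launch sketch (illustrative): 'python -m torch.distributed.launch --nproc_per_node=N'
+# below starts N train.py workers on the local node and hands each worker its index
+# via the --local_rank argument / LOCAL_RANK environment variable, which train.py
+# reads to select a GPU and join the NCCL process group. A hypothetical, untested
+# single-process debug run could bypass the launcher, e.g.:
+#
+#   python train.py --dataset traffic --data_path /data/processed/traffic_bin \
+#       --distributed_world_size 1 --affinity disabled --epochs 1 --results /tmp/tft_debug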
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, 
num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, 
verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number 
of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
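+
+# Usage sketch (illustrative; mirrors how train.py drives PerformanceMeter):
+#
+#   meter = PerformanceMeter()
+#   for step, batch in enumerate(train_loader):
+#       meter.reset_current_lap()
+#       # ... forward / backward / optimizer step ...
+#       ips = meter.update(batch_size, exclude_from_total=step == 0)
+#   print(meter.avg)  # running items/s over the measured steps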
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + 
data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * 
args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + 
parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
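+
+# Note (illustrative): besides the running throughput in 'avg', PerformanceMeter.p(i)
+# returns an approximate i-th percentile of the recorded per-step intervals in seconds,
+# e.g. meter.p(90) for the 90th-percentile step time.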
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/PyTorch/Forecasting/TFT/configuration.py b/PyTorch/Forecasting/TFT/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/PyTorch/Forecasting/TFT/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == 
DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/PyTorch/Forecasting/TFT/criterions.py b/PyTorch/Forecasting/TFT/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/PyTorch/Forecasting/TFT/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/PyTorch/Forecasting/TFT/data_utils.py b/PyTorch/Forecasting/TFT/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/PyTorch/Forecasting/TFT/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
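+
+# Schema sketch (illustrative): each dataset is declared as a list of FeatureSpec
+# entries pairing an input role (InputTypes) with a storage type (DataTypes), e.g.
+# as in configuration.py:
+#
+#   FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS)
+#   FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL)
+#
+# The Dataset classes below slice each entity's history into windows of
+# config.example_length steps and group columns into the FEAT_NAMES order
+# expected by the model's embedding layer.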
+ +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. 
+ # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. 
+ # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
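# --- Editor's note (illustrative sketch, not part of the original diff) ---
# The binary dump below packs every remaining column, categorical ids included,
# into one float32 matrix and reinterprets the raw bytes as int32 before
# pickling; TFTBinaryDataset reverses this by viewing the buffer as float32
# again and casting each column group to its target dtype. A minimal round-trip
# showing why the view()/astype() pair is lossless for small integer ids
# (they are exactly representable in float32):
import numpy as np

ids = np.array([[3.0], [7.0], [7.0]], dtype=np.float32)     # an id column kept as float32
packed = ids.view(dtype=np.int32)                           # reinterpret bytes for the .bin file
restored = packed.view(dtype=np.float32).astype(np.int64)   # what the dataset does at load time
assert (restored[:, 0] == np.array([3, 7, 7])).all()
# --- end of editor's note ---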
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
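# --- Editor's note (illustrative, not part of the original diff) ---
# The 't' column built just below derives hours-from-start from a timedelta:
# `.seconds` is only the within-day remainder, so it is combined with
# `.days * 24` to obtain total hours. Tiny sanity check of that arithmetic:
import pandas as pd

start = pd.Timestamp('2014-01-01 00:00')
later = pd.DatetimeIndex(['2014-01-02 05:00'])
delta = later - start
hours = delta.seconds / 60 / 60 + delta.days * 24
assert hours[0] == 29.0        # 1 day and 5 hours after the start
# --- end of editor's note ---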
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
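# --- Editor's note (toy example of the join described above; not part of the original diff) ---
# Dates missing from the oil series come out of the left join as NaN, the first
# fillna propagates the last known quote forward, and anything before the first
# quote falls back to the -1 sentinel:
import pandas as pd

oil_px = pd.Series([50.0, 52.0],
                   index=pd.to_datetime(['2015-01-02', '2015-01-05']),
                   name='oil')
frame = pd.DataFrame({'date': pd.to_datetime(['2015-01-01', '2015-01-02', '2015-01-04'])})
frame = frame.join(oil_px, on='date', how='left')
frame['oil'] = frame['oil'].fillna(method='ffill').fillna(-1)
# frame['oil'] is now [-1.0, 50.0, 50.0]
# --- end of editor's note ---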
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/ema.py b/PyTorch/Forecasting/TFT/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/PyTorch/Forecasting/TFT/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/PyTorch/Forecasting/TFT/gpu_affinity.py b/PyTorch/Forecasting/TFT/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/PyTorch/Forecasting/TFT/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
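# --- Editor's note (usage sketch for ModelEma from ema.py above; not part of the original diff) ---
# ModelEma keeps a decayed copy of the model's state_dict; a typical loop calls
# update() after each optimiser step and evaluates via ema.module. Minimal
# sketch, assuming ema.py is importable from the working directory:
import torch
import torch.nn as nn
from ema import ModelEma

net = nn.Linear(8, 1)
ema = ModelEma(net, decay=0.9)
for _ in range(3):                        # stand-in for real optimiser steps
    with torch.no_grad():
        net.weight.add_(0.01)             # pretend the weights moved
    ema.update(net)                       # shadow weights drift towards the live ones
with torch.no_grad():
    print(ema.module(torch.zeros(1, 8)))  # evaluate with the averaged weights
# --- end of editor's note ---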
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/PyTorch/Forecasting/TFT/inference.py b/PyTorch/Forecasting/TFT/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, 
scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = 
SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
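# --- Editor's note (illustrative, not part of the original diff) ---
# q_risk computed in inference() above corresponds to the normalised quantile
# (pinball) risk reported for TFT: per-point loss is
# max(q*(y - y_hat), (q - 1)*(y - y_hat)), normalised by the mean absolute
# target and doubled. Hand-checked single-point example for q = 0.5:
import torch

y, y_hat, q = torch.tensor(10.0), torch.tensor(12.0), 0.5
pinball = torch.max(q * (y - y_hat), (q - 1) * (y - y_hat))   # -> 1.0
q_risk = 2 * pinball / y.abs()                                # -> 0.2
# --- end of editor's note ---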
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/log_helper.py b/PyTorch/Forecasting/TFT/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/PyTorch/Forecasting/TFT/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git 
a/PyTorch/Forecasting/TFT/modeling.py b/PyTorch/Forecasting/TFT/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/PyTorch/Forecasting/TFT/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. 
Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + 
self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = 
F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + 
enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/PyTorch/Forecasting/TFT/requirements.txt b/PyTorch/Forecasting/TFT/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/PyTorch/Forecasting/TFT/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/PyTorch/Forecasting/TFT/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/PyTorch/Forecasting/TFT/scripts/get_data.sh b/PyTorch/Forecasting/TFT/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/PyTorch/Forecasting/TFT/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
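+
+# Launches distributed TFT training on the preprocessed electricity dataset with
+# mixed precision (--use_amp) via torch.distributed.launch. SEED, LR, NGPU,
+# BATCH_SIZE and EPOCHS can be overridden from the environment and default to
+# the values below; the best checkpoint and dllogger/TensorBoard logs are
+# written under the --results directory.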
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile b/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/tft_pyt/LICENCE b/PyTorch/Forecasting/TFT/tft_pyt/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. 
+To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/PyTorch/Forecasting/TFT/tft_pyt/NOTICE b/PyTorch/Forecasting/TFT/tft_pyt/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/PyTorch/Forecasting/TFT/tft_pyt/README.md b/PyTorch/Forecasting/TFT/tft_pyt/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. 
The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. +This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, by either masking their values in attention matrices or embedding them as a special value in the latent space. +This model enables the prediction of confidence intervals for future values of time series for multiple future timesteps. + +This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +The TFT model is a hybrid architecture joining LSTM encoding of time series and interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for whole history and future), observed (known only for historical data). All these variables come in two flavors: categorical, and continuous. In addition to historical data, we feed the model with historical values of time series. 
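+
+As an illustration of these variable groups, the sketch below shows the dictionary of tensors that the `forward` method in `modeling.py` consumes for the electricity configuration; groups a dataset does not contain are simply omitted. This is an informal example rather than part of the training scripts: `model` stands for an instantiated `TemporalFusionTransformer`, and the batch size and tensor values are placeholders.
+```python
+import torch
+
+batch, example_len = 16, 192      # example_length = 8 * 24 in configuration.py
+x = {
+    # static categorical ids, constant over all timesteps (369 distinct consumers)
+    's_cat':  torch.randint(0, 369, (batch, example_len, 1)),
+    # known continuous features: hour, day_of_week, hours_from_start
+    'k_cont': torch.rand(batch, example_len, 3),
+    # history of the observed target (power_usage); always required
+    'target': torch.rand(batch, example_len, 1),
+}
+# out = model(x)  # -> [batch, forecast_horizon, len(quantiles)] predicted quantiles
+```
+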
All variables are embedded in high-dimensional space by learning an embedding vector. Categorical variables embeddings are learned in the classical sense of embedding discrete values. The model learns a single vector for each continuous variable, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for variable selection of other variables and as an initial state of LSTM encoders. +After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. Whole architecture is interwoven with residual connections with gating mechanisms that allow the architecture to adapt to various problems by skipping some parts of it. +For the sake of explainability, heads of self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is VSN activations, which tells us how relevant the given feature is to the prediction. +![](TFT_architecture.PNG) +*image source: https://arxiv.org/abs/1912.09363* + +### Default configuration + +The specific configuration of the TFT model depends on the dataset used. Not only is the volume of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For a part of the datasets, we apply scaling per-time-series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss: +For quantiles in [0.1, 0.5, 0.9]. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below. + +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Yes column +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. 
+[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. 
+ +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. +```bash +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/PyTorch/Forecasting/TFT +``` + +2. Build the TFT PyTorch NGC container. +```bash +docker build --network=host -t tft . +``` + +3. Start an interactive session in the NGC container to run training/inference. +```bash +docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft +``` + +Note: Ensure to mount your dataset using the -v flag to make it available for training inside the NVIDIA Docker container. + +4. Download and preprocess datasets. +```bash +bash scripts/get_data.sh +``` + +5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. +These scripts are tuned for DGX1-32G. If you have a different system, use NGPU and BATCH_SIZE variables to adjust the parameters for your system. +```bash +bash scripts/run_electricity.sh +bash scripts/run_traffic.sh +``` + +6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy. 
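+
+As a point of reference for interpreting the reported numbers, here is a minimal sketch of the per-quantile q-risk computation, following the normalized quantile-loss definition from the TFT paper. `preds` and `tgts` are assumed to be NumPy arrays of predicted quantiles and true values, and the helper name is illustrative rather than an API of this repository.
+```python
+import numpy as np
+
+def qrisk(preds, tgts, quantiles=(0.1, 0.5, 0.9)):
+    """preds: [N, n_quantiles] predicted quantiles, tgts: [N, 1] true values."""
+    q = np.array(quantiles)
+    diff = tgts - preds                              # y - y_hat, broadcast over quantiles
+    ql = np.maximum(q * diff, (q - 1.0) * diff)      # pinball (quantile) loss
+    return 2.0 * ql.sum(axis=0) / np.abs(tgts).sum() # normalized per-quantile risk
+```
+Summing the three per-quantile values gives the single joint number used, for example, in the training stability tables below.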
+```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin +``` + +7. Start inference/predictions. Visualize and save predictions by running the following command. +```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin \ +--visualize \ +--save_predictions +``` + + + +Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance to [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. +## Advanced + +The following sections provide more details about the dataset, running training and inference, and the training results. + +### Scripts and sample code + +In the root directory, the most important files are: + +`train.py`: Entry point for training +`data_utils.py`: File containing the dataset implementation and preprocessing functions +`modeling.py`: Definition of the model +`configuration.py`: Contains configuration classes for various experiments +`test.py`: Entry point testing trained model. +`Dockerfile`: Container definition +`log_helper.py`: Contains helper functions for setting up dllogger +`criterions.py`: Definitions of loss functions + +The `scripts` directory contains scripts for default use cases: +`run_electricity.sh`: train default model on the electricity dataset +`run_traffic.sh`: train default model on the traffic dataset + +### Command-line options + +To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: +`python train.py --help`. + +The following example output is printed when running the model: +``` +usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] + [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] + +optional arguments: + -h, --help show this help message and exit + --data_path DATA_PATH + --dataset {electricity,volatility,traffic,favorita} + --epochs EPOCHS + --sample_data SAMPLE_DATA SAMPLE_DATA + --batch_size BATCH_SIZE + --lr LR + --seed SEED + --use_amp Enable automatic mixed precision + --clip_grad CLIP_GRAD + --early_stopping EARLY_STOPPING + Stop training if validation loss does not improve for more than this number of epochs. + --results RESULTS + --log_file LOG_FILE + --distributed_world_size N + total number of GPUs across all nodes (default: all visible GPUs) + --distributed_rank DISTRIBUTED_RANK + rank of the current worker + --local_rank LOCAL_RANK + rank of the current worker + --overwrite_config OVERWRITE_CONFIG + JSON string used to overload config + +``` + +### Getting the data + +The TFT model was trained on the electricity and traffic benchmark datasets. 
This repository contains the `get_data.sh` download script, which for electricity and and traffic datasets will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers. +#### Dataset guidelines + +The `data_utils.py` file contains all functions that are used to preprocess the data. Initially the data is loaded to a `pandas.DataFrame` and parsed to the common format which contains the features we will use for training. Then standardized data is cleaned, normalized, encoded and binarized. +This step does the following: +Drop all the columns that are not marked in the configuration file as used for training or preprocessing +Flatten indices in case time series are indexed by more than one column +Split the data into training, validation and test splits +Filter out all the time series shorter than minimal example length +Normalize columns marked as continuous in the configuration file +Encode as integers columns marked as categorical +Save the data in csv and binary formats + +#### Multi-dataset +In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows: +There is at least one id column +There is exactly one time column (that can also be used as a feature column) +Each feature is in a separate column +Each row represents a moment in time for only one time series +Additionally, you must specify a configuration of the network, including a data description. Refer to the example in `configuration.py` file. +### Training process + +The `train.py` script is an entry point for a training procedure. Refined recipes can be found in the `scripts` directory. +The model trains for at most `--epochs` epochs. If option `--early_stopping N` is set, then training will end if for N subsequent epochs validation loss hadn’t improved. +The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. 
Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
+ +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG differ diff --git a/PyTorch/Forecasting/TFT/tft_pyt/configuration.py b/PyTorch/Forecasting/TFT/tft_pyt/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and 
x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/PyTorch/Forecasting/TFT/tft_pyt/criterions.py b/PyTorch/Forecasting/TFT/tft_pyt/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py b/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
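+
+# This module defines the column specification (FeatureSpec, InputTypes, DataTypes),
+# the CSV-backed TFTDataset, the faster pickled binary TFTBinaryDataset, and the
+# preprocess() pipeline that splits, scales, encodes and serializes the raw data
+# consumed by training and inference.
+#
+# Illustrative usage (the path below is hypothetical; preprocess() writes train.bin,
+# valid.bin and test.bin into its dst_path):
+#
+#     from configuration import ElectricityConfig
+#     config = ElectricityConfig()
+#     train_set = TFTBinaryDataset('/data/processed/electricity_bin/train.bin', config)
+#     batch = train_set[0]  # OrderedDict keyed by FEAT_NAMES ('s_cat', ..., 'target', 'id')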
+ +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. 
+ # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. 
+ # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
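+    # Each group is stored as a single float32 matrix whose bytes are reinterpreted
+    # as int32 before pickling; TFTBinaryDataset reverses the cast (view as float32,
+    # then astype the per-column dtype), so integer category codes, which fit exactly
+    # in float32, survive the round trip.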
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
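+
+        # Build the hourly frame for this time series: the raw power usage plus calendar
+        # features and elapsed-time indices (t, days_from_start) derived from the index.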
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/ema.py b/PyTorch/Forecasting/TFT/tft_pyt/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py b/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
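+
+# Pins each training process to CPU cores local to its GPU, based on the affinity
+# masks reported by NVML. Illustrative usage at process startup (local_rank and
+# nproc_per_node are assumed to come from the distributed launcher):
+#
+#     import gpu_affinity
+#     affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node, mode='socket_unique_interleaved')
+#     print(f'Rank {local_rank} pinned to {len(affinity)} cores')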
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/PyTorch/Forecasting/TFT/tft_pyt/inference.py b/PyTorch/Forecasting/TFT/tft_pyt/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, 
config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if 
args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
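+        # visualize_v2 saves one PDF per example under <results>/single_example_vis/<id>/,
+        # plotting the target together with the P10/P50/P90 quantile predictions.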
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py b/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
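+
+# Configures dllogger for this repository: a JSON stream written to
+# <results>/<log_file>, a human-readable stdout backend, and the custom TensorBoard
+# backend defined below. In distributed runs only rank 0 receives these backends;
+# all other ranks are initialized with an empty backend list.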
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git
a/PyTorch/Forecasting/TFT/tft_pyt/modeling.py b/PyTorch/Forecasting/TFT/tft_pyt/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. 
Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + 
self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = 
F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length # determines how far into the past the model looks + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # LSTM initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives a performance boost for unknown reasons + + # Skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute historical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x +
enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt b/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
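+
+# Downloads the raw UCI electricity (LD2011_2014) and PEMS-SF traffic archives into
+# ${DATAPATH}/raw, then standardizes and binarizes each dataset into
+# ${DATAPATH}/processed/<dataset>_bin using the helpers in data_utils.py and the
+# dataset configs in configuration.py. Run it from a directory where those modules
+# are importable (e.g. the repository root inside the container).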
+ +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
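+
+# DGX-1 16GB entry point; the recipe is identical to scripts/run_electricity.sh.
+# SEED, LR, NGPU, BATCH_SIZE and EPOCHS can be overridden from the environment,
+# for example: NGPU=1 BATCH_SIZE=512 bash scripts/run_electricity_DGX1-16G.sh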
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/train.py b/PyTorch/Forecasting/TFT/tft_pyt/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + 
args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * 
losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # weighted average takes into account that the last batch may not be full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation.
Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/utils.py b/PyTorch/Forecasting/TFT/tft_pyt/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/PyTorch/Forecasting/TFT/tft_pyt/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/PyTorch/Forecasting/TFT/train.py b/PyTorch/Forecasting/TFT/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/PyTorch/Forecasting/TFT/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = 
int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, 
tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # weighted average takes into account that the last batch may not be full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation.
Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/PyTorch/Forecasting/TFT/utils.py b/PyTorch/Forecasting/TFT/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/PyTorch/Forecasting/TFT/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
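+
+# Copy of tft_pyt/utils.py. PerformanceMeter records the wall-clock interval between
+# consecutive update() calls to derive throughput (items/s), total time and latency
+# percentiles via p(i); reset_current_lap() restarts the timer without recording an interval.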
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/.dockerignore b/Tools/PyTorch/TimeSeriesPredictionPlatform/.dockerignore new file mode 100755 index 00000000..de69b685 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/.dockerignore @@ -0,0 +1,8 @@ +.idea +**/.ipynb_checkpoints +**/__pycache__ +**/.gitkeep +.git +.gitignore +Dockerfile +.dockerignore diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/.gitignore b/Tools/PyTorch/TimeSeriesPredictionPlatform/.gitignore new file mode 100755 index 00000000..41d0e9e0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/.gitignore @@ -0,0 +1,5 @@ +.ipynb_checkpoints +__pycache__ +/outputs/ +*.zip +/datasets/*/ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/Dockerfile new file mode 100755 index 00000000..21d34629 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/Dockerfile @@ -0,0 +1,61 @@ +#SPDX-License-Identifier: Apache-2.0 +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.09-py3 + +FROM ${FROM_IMAGE_NAME} + +ENV DEBIAN_FRONTEND=noninteractive +ENV DCGM_VERSION=2.2.9 + +ENV MODEL_NAVIGATOR_CONTAINER=1 +RUN apt-get update && \ + apt-get install --no-install-recommends -y software-properties-common curl python3-dev python3-pip python-is-python3 libb64-dev wget git wkhtmltopdf && \ + \ + curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - && \ + add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian buster stable" && \ + apt-get update && \ + apt-get install --no-install-recommends -y docker-ce docker-ce-cli containerd.io && \ + \ + . 
/etc/os-release && \ + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey| apt-key add - && \ + curl -s -L "https://nvidia.github.io/nvidia-docker/${ID}${VERSION_ID}/nvidia-docker.list" > /etc/apt/sources.list.d/nvidia-docker.list && \ + apt-get update && \ + apt-get install --no-install-recommends -y nvidia-docker2 && \ + \ + curl -s -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \ + dpkg -i datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \ + rm datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \ + \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# Install perf_client required library +RUN apt-get update && \ + apt-get install -y libb64-dev libb64-0d curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Set workdir and python path +WORKDIR /workspace +ENV PYTHONPATH /workspace + +RUN pip install --upgrade pip +ADD requirements.txt /workspace/requirements.txt +ADD triton/requirements.txt /workspace/triton/requirements.txt +RUN pip install -r /workspace/requirements.txt +RUN pip install -r /workspace/triton/requirements.txt +RUN pip install nvidia-pyindex +RUN pip install nvidia-dllogger +RUN pip install --no-cache-dir -r requirements.txt -f https://data.dgl.ai/wheels/repo.html + +# Add model files to workspace +ADD . /workspace + + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN rm -rf examples +RUN rm -rf docker-examples +RUN rm -rf tutorial diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE new file mode 100755 index 00000000..c1a81fee --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE AGREEMENT new file mode 100755 index 00000000..c79d78b1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE AGREEMENT @@ -0,0 +1,24 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. 
Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/NOTICE new file mode 100755 index 00000000..39c73b0f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/NOTICE @@ -0,0 +1,205 @@ +This repository contains code from https://github.com/google-research/google-research/tree/master/tft under the Apache 2.0 License (included below). + +This repository contains code from https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py under the Apache 2.0 License (included below). 
+ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md new file mode 100755 index 00000000..1845851d --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md @@ -0,0 +1,491 @@ +# Time-Series Prediction Platform 1.0 for PyTorch + +Time-series prediction is a common problem in multiple domains for various applications, including retail, industry, smart cities, and financial services. Research in the time-series field is growing exponentially, with hundreds of deep learning time-series forecasting paper submissions to ICML, ECML, ITISE, and multiple journals every year. However, there is currently no common framework to compare the accuracy and performance of all the models from industry or academia. + +## Solution Overview +Time-Series Prediction Platform (TSPP) enables users to mix and match datasets and models. The user has complete control over the following settings and can compare results obtained from various solutions side by side: +- Evaluation metrics +- Evaluation datasets +- Prediction horizons +- Prediction sliding window sizes +- Model choice +- Model hyperparameters + +### Time-Series Prediction Platform architecture + +The platform has the following architecture. + + +![Time-series Prediction Platform architecture](TSPP_Architecture.png) +In the previous figure, the command line feeds input to the TSPP launcher, which uses said input to configure the components required to train and test the model. + + +The platform is designed to support multiple data types for input features, including the observed values of the forecasted time-series, known data supporting the forecasts (for example, day of the week), and static data (for example, user ID). This is summarized in the following figure. + + +
+ +Time-series data type + +
+ +### Default configuration +The TSPP utilizes the default configurations provided by each model for each accompanying dataset. More information on individual model configurations can be found within the respective model repositories. By default, Temporal Fusion Transformer (TFT) is included within the TSPP. + +### Models + - [Temporal Fusion Transformer (TFT)](../../../PyTorch/Forecasting/TFT) + - AutoARIMA + +### Feature support matrix +This tool supports the following features: + +| Feature | Time-Series Prediction Platform +|-----------------------|-------------------------- +|[Automatic mixed precision (AMP)](https://pytorch.org/docs/stable/amp.html)| Yes +|[Multi-GPU training with PyTorch DDP](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) | Yes +|TorchScript, ONNX, and TRT conversion and NVIDIA Triton Deployment | Yes + +#### Features + +**Automatic Mixed Precision (AMP)**: [Automatic mixed precision](https://pytorch.org/docs/stable/amp.html) is a mode of computation for PyTorch models that allows operations to use float16 instead of float32, potentially accelerating selected operations and total model runtime. More information can be found under the Mixed precision training section. + +**Multi-GPU training with PyTorch Distributed Data Parallel (DDP)**: [DDP](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) is a mode of computation for PyTorch models that allows operations to be executed across multiple GPUs in parallel to accelerate computation. + +**TorchScript, ONNX, and TRT conversion and NVIDIA Triton Deployment**: refers to the conversion of a model to the aforementioned formats and the ability to deploy the resulting converted models to an NVIDIA Triton inference server. More detail about this process and native inference can be found in the Advanced section under the Conversion, Deployment, and Inference subsection. + + + + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in NVIDIA Volta, and following with both the NVIDIA Turing and NVIDIA Ampere Architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps: +1. Porting the model to use the FP16 data type where appropriate. +2. Adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced in the NVIDIA Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK. + +For information about: +- How to train using mixed precision, refer to the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation. +- Techniques used for mixed precision training, refer to the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog. 
+- How to access and use AMP for PyTorch, refer to the [Torch-AMP](https://pytorch.org/docs/stable/amp.html) guide. + +#### Enabling mixed precision + +Mixed precision can be enabled by specifying `amp=True` in the launch call. Note that in some cases, for example when the batch size is small, the overhead of scheduling kernels for mixed precision can be larger than the performance gain from using lower precision, resulting in lower overall throughput. +## Setup +The following section lists the requirements that you need to meet in order to run the Time-Series Prediction Platform. + + +### Requirements + +This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Ampere Architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/), [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU +- Ubuntu 18.04 +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [docker-compose](https://docs.docker.com/compose/install/). For an up-to-date version, installing from the web is recommended. +- Custom Docker containers built for this model. Refer to the steps in the [Quick Start Guide](#quick-start-guide). + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) + +If you are unable to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + + +## Quick start guide + +### Getting Started +1. Create a dataset directory. The directory can be arbitrary, and it is recommended not to include it in the TimeSeriesPredictionPlatform directory. This arbitrary directory will be mounted to the TSPP container later. In the following steps this directory will be referred to as /your/datasets/. + +2. Enter the Deep Learning Examples TSPP repository: + +``` +cd DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform +``` +3. Run the repository setup: +``` +source scripts/setup.sh +``` + +4. Build the docker image: +``` +docker build -t tspp . +``` + +5. Next we will start our container and mount the dataset directory, which means that /workspace/datasets/ points to /your/datasets/. Any changes made to this folder in the docker container are reflected in the original directory and vice versa. If we want to mount additional folders we can add `-v /path/on/local/:/path/in/container/` to the run command. This will be useful if we want to save the outputs from training or inference once we close the container. To start the docker container: +``` +docker run -it --gpus all --ipc=host --network=host -v /your/datasets/:/workspace/datasets/ tspp bash +``` + +6. After running the previous command you will be placed inside the docker container in the /workspace directory. 
Inside the container, download either the electricity or traffic dataset: +``` +python data/script_download_data.py --dataset {dataset_name} --output_dir /workspace/datasets/ +``` +The raw electricity dataset is the 15-minute electricity consumption of 370 customers from the UCI Electricity Load Diagrams. We aggregate to an hourly forecast and use the previous week to predict the following day. +The raw traffic dataset is the 10-minute occupancy rate of San Francisco freeways from 440 sensors downloaded from the UCI PEMS-SF Data Set. We again aggregate to an hourly forecast and use the previous week to predict the following day. + +7. Preprocess the dataset: +``` +python launch_preproc.py dataset={dataset} +``` + +8. Launch the training, validation, and testing process using the temporal fusion transformer model: +``` +python launch_tspp.py model=tft dataset={dataset} criterion=quantile +``` +Outputs are stored in /workspace/outputs/{date}/{time}. + + +### Adding a new dataset + +The TSPP has been designed to work with most CSV input. In order to add an arbitrary dataset to the TSPP: + +1. Enter the Deep Learning Examples TSPP repository: + +``` +cd DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform +``` + +2. Include the target dataset in the directory in which you want to keep your datasets. The directory can be arbitrary, and it is recommended not to include it in the TimeSeriesPredictionPlatform directory. This arbitrary directory will be mounted to the TSPP container later. + +``` +cp -r /PATH/TO/YOUR/DATASET /your/datasets/ +``` + +3. Create a configuration file for your dataset, found in TimeSeriesPredictionPlatform/conf/dataset, that includes the following values: + + * source_path: The path to the CSV that contains your dataset + + * dest_path: The path to where preprocessing should write your preprocessed dataset + + * time_ids: The name of the column within your source CSV that is the feature to split your training, validation, and test datasets on. + + * train_range, valid_range, test_range: The ranges that mark the edges of the train, validation, and test subsets. Remember that there can be overlap between subsets since predicting the first ‘unseen element’ requires the input of the seen elements before it. + + * dataset_stride: The stride the dataloader uses to walk the sliding window through the dataset. Default: 1 + + * scale_per_id: Whether to scale continuous features during preprocessing using scalers fitted on just samples from the same ID (True), or all samples (False, Default) + + * encoder_length: The length of data known up until the ‘present’ + + * example_length: The length of all data, including data known into the future. The prediction horizon is the difference between the example_length and encoder_length. + + * features: A list of the features that the model takes as input. Each feature should be represented by an object containing descriptive attributes. All features should have at least a feature_type (ID, TIME, TARGET, WEIGHT, SAMPLE_WEIGHT, KNOWN, OBSERVED, or STATIC) and feature_embed_type (CONTINUOUS or CATEGORICAL). Continuous features may have a scaler attribute that represents the type of scaler used in preprocessing. Categorical columns should have a cardinality attribute that represents the number of unique values that that feature takes. Examples can be found in the files in /TimeSeriesPredictionPlatform/conf/dataset/. 
Required features are one TIME feature, at least one ID feature, one TARGET feature, and at least one KNOWN, OBSERVED, or STATIC feature. + + + * train_samples: The number of samples that should be taken at train time to use as train input to your model for a single epoch + + * valid_samples: The number of samples that should be taken at train time to use as validation input to your model for a single epoch + + * binarized: Whether or not preprocessing should accelerate data loading by outputting the preprocessed dataset in a binarized format + + * time_series_count: The number of unique time-series contained in the dataset. + + +4. After a specification has been written, it is ready to be preprocessed with: + +``` +docker build -t tspp . +docker run -it --gpus all -v /your/datasets/:/workspace/datasets/ --ipc=host tspp bash +python launch_preproc.py dataset={dataset_name} +``` + +For some models, additional parameters are required for each dataset. As mentioned in the Adding a new model section, there are examples of these model-dataset combination files in `TimeSeriesPredictionPlatform/conf/model_dataset/`. An example here would be model A requiring a specific hidden size when used on dataset B. In this case, TimeSeriesPredictionPlatform/conf/model_dataset/A_B.yaml should contain the desired hidden size under config.model.hidden_size. +5. Test your dataset by training and evaluating a temporal fusion transformer. Training, validation, and testing are all included by default using the launch_tspp.py command shown below: + + +``` +docker run -it --gpus all -v /your/datasets/:/workspace/datasets/ --ipc=host tspp bash +python launch_tspp.py dataset={YOUR_DATASET} model=tft criterion=quantile +``` + +If you encounter errors stating that srcIndex < value, verify that your categorical cardinalities are the correct size, as this error indicates that the value of a categorical you are trying to embed is too large for its respective embedding table. + + + +### Adding a new model + +Models added to the prediction platform are subject to a few key constraints. Namely, the models should be constructed using vanilla PyTorch. Models should handle the forecasting task (anomaly detection and classification are planned); models should expect that the data is fed in a sliding window and that tensors will be aggregated by Temporal/Data Type. An example of how this works can be found in data/data_utils.py. Integrated models should expect the data to be in the format described by the feature spec for a specific dataset (output being a dictionary of tensors aggregated based on temporal and feature type). + +To integrate a model into the TSPP: + +1. Enter the Deep Learning Examples repository: + +``` +cd DeeplearningExamples +``` + +2. Copy the model files into the Deep Learning Examples PyTorch/Forecasting/ directory: + +``` +cp -r /PATH/TO/YOUR/MODEL PyTorch/Forecasting/ +``` + +3. Write a configuration file for the model in `DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model`. + +This configuration file should reflect the default configuration for your model. Within this file, the _target_ of the model component should be set to point to your model class. If your model needs additional configuration values based on the dataset, you should create a configuration file in `DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/{modelname_dataset_name.yaml}` named according to the model and dataset names. 
Examples can be found in the `DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/tft.yaml` and `DeeplearningExamples/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_traffic.yaml` files. + +4. Build and launch container: +``` +cd DeeplearningExamples/Tools/PyTorch +source scripts/setup.sh +docker build -t tspp TimeSeriesPredictionPlatform +docker run -it --rm --ipc=host --network=host --gpus all -v /PATH/TO/YOUR/DATASET/FOLDER/:/workspace/datasets/ tspp bash +``` + +5. Verify that the model can be run within the TSPP: +``` +python launch_tspp.py model={model_name} +``` +Some additional values may be needed in this call. For example, if your model requires the Adam optimizer, you will need to append optimizer=Adam to your call. + + + +## Advanced +The following sections provide greater detail on changing the dataset, altering the data preprocessing, and comparing the training results. + + +### Running multi-GPU experiments + + +Launching on multi-GPU requires no changes to model code and can be executed as follows within a TSPP container: +``` +python -m torch.distributed.run --nproc_per_node={num_GPUS} launch_tspp.py {override parameters} +config.device.world_size={num_GPUS} +``` + +Statistical models (like AutoARIMA) are not run on a GPU, so they are not suitable for multi-GPU acceleration. + +### Running experiments with Exponential Moving Averaging + +Exponential moving averaging is a technique in which, while training, the model weights are integrated into a weighted moving average, and the weighted moving average is used in lieu of the directly trained model weights at test time. Our experiments have found this technique improves the convergence properties of most models and datasets we work with. The full paper on EMA can be found at https://arxiv.org/pdf/1803.05407.pdf. + +To activate EMA in the TSPP, simply specify `ema=True` in the command line call at runtime. The decay parameter in the moving average can be modified using the config.trainer.ema.decay parameter. +### Hyperparameter Search + +Hyperparameter search can be used to find semi-optimal hyperparameter configurations for a given model or dataset. In the TSPP, hyperparameter search is driven by Optuna. + +To launch hyperparameter search, one must first have a base config. One can be generated by running launch_tspp.py with desired values and +config.save_config=True and +config.save_path=/path/to/conf.yaml + +Once a config file has been generated in /path/to/conf.yaml, open it and replace any field you want to include as a searchable hyperparameter with an optuna variable config. This optuna variable config describes the value you are searching on as well as the distribution that value is pulled from. +The possible Optuna sampling objects and the parameters that you can use are: + +- categorical: samples from values uniformly. 
+ - values: The values categorical sampling can take +- int_uniform: samples uniformly from the range specified by (min_value, max_value, step_value) + - min_value: the minimum value that int_uniform sampling can take + - max_value: the maximum value that int_uniform sampling can take + - step_value (optional): the size of the steps in between possible samples +- float_uniform: samples uniformly from the range specified by (min_value, max_value) + - min_value: the minimum value that float_uniform sampling can take + - max_value: the maximum value that float_uniform sampling can take +- log_uniform: samples using the log distribution from the range specified by (min_value, max_value) + - min_value: the minimum value that log_uniform sampling can take + - max_value: the maximum value that log_uniform sampling can take +- discrete_uniform: samples uniformly from the range specified by (min_value, max_value, step_value) + - min_value: the minimum value that discrete_uniform sampling can take + - max_value: the maximum value that discrete_uniform sampling can take + - step_value (optional): the size of the steps in between possible samples + +For example, to sample batch size between 512 and 1024, replace the batch size object with: + +batch_size: + sampling: categorical + values: + - 512 + - 1024 + +To sample learning rate with uniform probability between .1 and 1, we can replace the lr with: + +lr: + sampling: float_uniform + min_value: .1 + max_value: 1.0 + + +Once all desired values have been replaced with Optuna objects, append an optuna field at the bottom of the config, with a subfield n_trials that denotes how many Optuna trials should be run and, optionally, a description of the Optuna sampler to use. +Once this config file is saved, you can run `python launch_optuna.py --config_path /path/to/conf.yaml`. This script attempts to make use of all visible GPUs. Currently, we do not support using a varied number of GPUs for separate searches, meaning the world_size config field should be an integer instead of a list. In addition, we do not support the use of multi-process dataloading in parameter searches, meaning num_workers is set to 0. The number of concurrent trials being run is equal to the floor of the number of GPUs divided by the fixed world size. Outputs will still be saved to /workspace/outputs/{DATE}/{TIME}/. Each concurrent worker performs its own n_trials runs, yet all outputs are saved by the same Optuna study. This means that if 4 subprocesses are launched with 10 trials specified in the config, then 40 trials are run. Optuna will always run n_trials trials, and will not necessarily cover the entire set of possible configurations even if that set is bounded. For example, if you ran a set of 4 trials, where the only Optuna object being optimized is a categorical with 3 values, not all 3 values would necessarily occur within the trials. + +### Conversion, Deployment, and Inference + +Inference takes place after a model has been trained and one wants to run new data through it. Since this only entails using a forward function, the model can be optimized and converted to many different formats that can perform the forward pass more efficiently. In addition, one can set up an [NVIDIA Triton inference server](https://github.com/triton-inference-server/server), which allows for a continuous stream of data to be presented to and passed through the model. 
The server provides an inference service via an HTTP or gRPC endpoint at ports 8000 and 8001, respectively, on the "bridge" docker network. + + +The TSPP supports a few versions of inference, including native inference and NVIDIA Triton deployment. Both use the test_forward function specified in the model config (defaults to forward()) as the forward function. + +To launch native inference, one must have a checkpoint directory from a TSPP training call that includes a .hydra directory and a best_checkpoint.pth.tar. Then run: +``` +python launch_inference.py device={device} config.evaluator.checkpoint=/path/to/checkpoint/directory +``` +Note: Do not confuse the checkpoint directory with the TimeSeriesPredictionPlatform/outputs/ directory. The directory to use in the inference call is two levels lower (for example, /path/to/TimeSeriesPredictionPlatform/outputs/2021-08-23/03-03-11/). + +The device argument refers to the device that one would like the model to be built on and run on. Note that multi-GPU inference launches are not supported. By default, the evaluator uses the configs specified in the .hydra/config.yaml file from the checkpoint directory. One can override these by including them in the launch. For example, to adjust the metrics to use MAE and RMSE only and to set the device to the CPU: +``` +python launch_inference.py device=cpu config.evaluator.checkpoint=/path/to/checkpoint/directory "+config.evaluator.metrics=['MAE', 'RMSE']" +``` +Note: Be sure to include the + when overriding any of the evaluator configs. + +Prior to the next section, make sure that the TSPP container is run with the following arguments from the TSPP directory: +``` +docker run -it --rm --gpus all --ipc=host --network=host -v /your/datasets/:/workspace/datasets/ -v /your/outputs/:/your/outputs/ -v $(pwd):$(pwd) -v /your/outputs/:/workspace/outputs/ -v /var/run/docker.sock:/var/run/docker.sock tspp +``` +In the previous command, note that five different directories are mounted. The datasets are mounted to the usual location, but we have two different mount locations for outputs. Mounting the outputs to /workspace/outputs/ allows usual training calls to be saved in your output directory. The second output mount is mounted to the same path as the output directory is in the host. This is essential due to the way we deploy to NVIDIA Triton: the output directory in the docker container must match the output directory on the host machine. Additionally, the mount for /var/run/docker.sock allows the tspp docker container to launch another container; in our case, this is the NVIDIA Triton server. In subsequent calls to launch_deployment.py, the /path/to/checkpoint/directory/ must be of the form /your/outputs/{checkpoint_dir} instead of /workspace/outputs/{checkpoint_dir} and should be an absolute path. From testing, the best output directory to use appears to be TSPP/outputs. + +Finally, note that to run the deployment script, you must be in the same directory path in the container as the TSPP is stored on your machine. This means that simply being in /workspace in the container may not work for running the deployment. If outside the container your TimeSeriesPredictionPlatform is at /home/user/TimeSeriesPredictionPlatform, you must be at the same path in your docker container (/home/user/TimeSeriesPredictionPlatform). This is the purpose of the -v $(pwd):$(pwd) in the run script. 
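+For illustration, assuming a hypothetical host path of /home/user/TimeSeriesPredictionPlatform, the sequence inside the container would be: +``` +# cd to the same absolute path inside the container as the platform occupies on the host +cd /home/user/TimeSeriesPredictionPlatform +# then run the launch_deployment command described below +``` +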
+ + +To launch conversion and deployment, one must again have a checkpoint directory from a TSPP training call that includes a .hydra directory and a best_checkpoint.pth.tar. In addition, the model that will be converted must already support conversion to the required format. In the current version of the TSPP, we first export the model to either TorchScript-Script or TorchScript-Trace and subsequently convert to TorchScript, Onnx, or TRT using the model-navigator package. We also support export to Onnx and conversion to both Onnx and TRT. To run: +``` +python launch_deployment export={ts-trace, ts-script, onnx} convert={torchscript, onnx, trt} config.evaluator.checkpoint=/path/to/checkpoint/directory +``` +The format mapping is listed below: +- TorchScript-Script: ts-script +- TorchScript-Trace: ts-trace +- TorchScript: torchscript +- Onnx: onnx +- TRT: trt + +Note that the conversions do not support the apex fused LayerNorm library. In order to get around this, we set the OS environment variable `TFT_SCRIPTING` to True when loading the model for deployment. This changes the apex LayerNorm to vanilla torch LayerNorm. + +Similarly to the native inference, one can again override the evaluator configs. In addition, one can select the batch size and precision of the conversion, using config.inference.batch_size and config.inference.precision=Choice[ fp32, fp16 ] respectively. Once export and conversion have been done, the results are stored in /path/to/checkpoint/directory/deployment. Subsequently, the converted model's NVIDIA Triton config is generated in the /path/to/checkpoint/directory/deployment/navigator_workspace/model-store/ directory. In addition, a docker NVIDIA Triton server is launched based on this directory and inference is run through NVIDIA Triton. Finally, the outputs of this inference are used to calculate the metrics. The outputs of this inference and the results of the metric calculation are stored in a new output directory created at TimeSeriesPredictionPlatform/outputs/{today's date}/{time at launch}/. Within this directory the metrics are stored in metrics.csv, and the raw outputs of the inference are stored in the raw/ directory. The NVIDIA Triton model name is set to the second component of the model path. For example, in the case of our TFT model, whose path is models.tft_pyt.TemporalFusionTransformer, the name of the NVIDIA Triton model is tft_pyt. + +An additional option in running deployment is selecting whether to run the basics of conversion and NVIDIA Triton config creation or to run the full pipeline of conversion, NVIDIA Triton config creation, profiling, analysis, and helm chart creation. Setting config.inference.optimize=True during launch switches to the full pipeline. Another part of optimization is setting the backend accelerator for NVIDIA Triton config generation. Setting config.inference.accelerator=Choice[none, trt] changes the accelerator specified. Note that this defaults to 'none' and 'trt' is only compatible with the Onnx conversion. If one wants to launch the NVIDIA Triton inference server using a specific GPU, the CUDA index can be specified with the config option config.inference.gpu, which defaults to 0. 
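+As a sketch of how these options combine (all paths and values below are placeholders), a deployment that exports the model to Onnx, converts it to Onnx with the TRT backend accelerator, and runs FP16 inference with batch size 64 on GPU 0 could be launched as: +``` +python launch_deployment export=onnx convert=onnx config.evaluator.checkpoint=/your/outputs/2021-08-23/03-03-11/ config.inference.precision=fp16 config.inference.batch_size=64 config.inference.accelerator=trt config.inference.gpu=0 +``` +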
+ +More information on the conversion is located here: +https://github.com/triton-inference-server/model_navigator/blob/main/docs/conversion.md + +More information on the NVIDIA Triton config creation is located here: https://github.com/triton-inference-server/model_navigator/blob/main/docs/triton_model_configurator.md + +More information on the full pipeline is located here: +https://github.com/triton-inference-server/model_navigator/blob/main/docs/run.md + +If one only wants to run the latter part of the launch_deployment script, which includes the NVIDIA Triton server initialization, inference, and metrics calculation, set the option config.inference.skip_conversion=True at launch. The call still requires the checkpoint directory and for that directory to be set up in the same format as the result for a regular launch_deployment call (contains a deployment/navigator_workspace/model-store/ directory with the NVIDIA Triton models). +For this option of skipping the conversion, there is a config option +config.inference.model_name, which can be set to the NVIDIA Triton model name. This does not set the name of the model, but rather selects which of the possible models in the model-store directory will be used for inference. This is useful after a call using the optimize option, which can generate multiple different models in the model-store. +If one only wants to launch the NVIDIA Triton server and keep it live, set the option config.inference.just_deploy=True at launch. Again, like the previous option of skipping conversion, the checkpoint directory is still required and must conform to the format for the NVIDIA Triton models. This will not run inference automatically nor perform any other actions, it will solely start the NVIDIA Triton server using the given models. + +For both the launch_inference and launch_deployment one can specify what dataset and target_scalers to use (if any) as long as the data shapes do not conflict with the already trained model. To specify a dataset directory use +config.inference.dataset_dir=/path/to/dataset. The dataset directory must contain a composite_scaler.bin file as well as either train.bin/valid.bin/test.bin or train.csv/valid.csv/test.csv depending on the configuration option config.dataset.binarized (this option cannot be changed during deployment or inference). Once the path has been set, deployment and inference both use the test dataset. + +Our TFT model supports export to TorchScript-Trace and conversion to all formats. + +If you encounter an error such as +``` +RuntimeError: Model tft_pyt:1 is not ready +``` +Or +``` +ERROR root Exception in callback .wrapped_callback at 0x7f9437b469d0>: AttributeError("'InferenceServerException' object has no attribute 'get_response'") +``` +There are a few possible reasons for this to come up. First, make sure that when the TSPP docker container was launched the network argument was set to host. Next, one can run “docker ps”; if the container “trt_server_cont” shows up, close it using “docker stop trt_server_cont”. After this, one should try rerunning the command. If neither of these steps is applicable or the problem persists, it is a more specific issue that requires more debugging. 
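+As a minimal sketch of the cleanup described above (trt_server_cont is the container name used by the deployment script): +``` +# check whether a Triton server container from a previous run is still up +docker ps --filter name=trt_server_cont +# if it is, stop it and rerun the deployment command +docker stop trt_server_cont +``` +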
+
+
+
+### Parameters
+
+Parameters for each individual component are stored in
+```
+/workspace/conf/{component_type}/{component_name}.yaml
+```
+
+For example, the default parameters for TFT are stored in
+```
+/workspace/conf/model/tft.yaml
+```
+
+For component selection, the options are:
+
+**dataset**: `electricity`, `traffic`
+**model**: `tft`, `auto_arima`, `trivial_model`
+**criterion**: `GLL`, `MSE`, `quantile`
+**device**: `cuda`, `cuda_8GPU`, `cpu`
+**optimizer**: refer to `/workspace/conf/optimizer`
+**ema**: `True`; assumed to be `False` by default.
+**amp**: `True`; assumed to be `False` by default.
+
+
+
+If a parameter does not exist in the config, you must prepend `+` to its reference in the command line call. For example, `+config.evaluator.target_scalers=...` adds target_scalers to config.evaluator, whereas `config.evaluator.target_scalers=...` raises an error.
+
+Parameters that are not specific to an individual component are listed below. Parameters are listed hierarchically; that is, the config has an attribute `trainer`, which has an attribute `num_epochs` that controls the length of training:
+
+`config.log_path`: where to save your logs
+`config.trainer.batch_size`: the batch size to use
+`config.trainer.num_workers`: the number of workers to use for dataloading
+`config.trainer.num_epochs`: the number of epochs to train the model for
+`config.trainer.AMP`: whether to enable AMP for accelerated training
+`config.dataset.source_path`: where the original file (before preprocessing) is stored
+`config.dataset.dest_path`: the directory where the preprocessed dataset is saved and read from
+`config.dataset.time_ids`: the feature on which to split the dataset into `train`, `valid`, `test`
+`config.dataset.train_range`: the range of the time feature that represents the `train` set
+`config.dataset.valid_range`: the range of the time feature that represents the `validation` set
+`config.dataset.test_range`: the range of the time feature that represents the `test` set
+`config.dataset.dataset_stride`: the stride to use when creating the dataset
+`config.dataset.scale_per_id`: whether to scale each series based on its own statistics (`True`) or statistics across all series (`False`)
+`config.dataset.encoder_length`: the length of past data that is fed to the model
+`config.dataset.example_length`: the length of the full data that we are passing to the model. The length of the prediction horizon is the difference between the example and encoder lengths
+`config.dataset.features`: the features that the model will be using
+`config.dataset.train_samples`: the number of examples to sample for our `train` dataset from our `train` partition
+`config.dataset.valid_samples`: the number of examples to sample for our `validation` dataset from our `validation` partition
+`config.dataset.binarized`: whether or not to use a binarized dataset for speedup
+`config.device.world_size`: the number of GPUs the launcher is running on
+`config.optimizer.gradient_norm`: the maximum norm of gradients allowed via gradient clipping
+`config.optimizer.lr`: the learning rate to use for the optimizer
+NOTE: Any optimizer from `torch.optim` can be used, and all keywords can be specified by adding the corresponding attribute under `config.optimizer`
+`config.evaluator.use_weights`: whether to weight metrics by weights specified in the input. 
Note: There must be a `WEIGHT` feature specified in `config.dataset.features` for this option to work
+`config.evaluator.target_scalers`: scalers used to unscale targets so that non-normalized predictions and targets are used for metric calculation
+`config.evaluator.output_selector`: selects which output to use if the model has multiple outputs per time step (quantiles are an example)
+`config.evaluator.label_selector`: selects which label to use if the labels have multiple values per time step
+`config.evaluator.precision`: the precision to which the output metrics are formatted
+`config.evaluator.metrics`: a list of metrics to calculate on the test set
+`config.evaluator.checkpoint`: path to the checkpoint directory containing the checkpoint to be loaded for inference/deployment
+
+`config.inference.batch_size`: the batch size to be used for inference or deployment
+`config.inference.precision`: the precision of the exported model
+`config.inference.optimize`: setting this to True runs the model-navigator run script in place of the separate convert and triton-config-model steps
+`config.inference.skip_conversion`: during deployment, skips the export, conversion, and configuration steps. Instead, starts the inference server, runs inference, and calculates metrics
+
+`config.inference.just_deploy`: starts the NVIDIA Triton server based on the NVIDIA Triton model specified in the checkpoint directory
+`config.inference.dataset_dir`: overrides the default dataset path
+`config.inference.model_name`: uses the model listed under this model name when deploying to the NVIDIA Triton server. This will not change the default name assigned to the models in the model-store directory
+`config.inference.accelerator`: switches the backend accelerator in the triton-config-model step of the process
+`config.inference.gpu`: uses the GPU at this CUDA index when launching the NVIDIA Triton inference server
+
+
+
+
+
+
+
+
+## Release Notes
+
+We're constantly refining and improving our performance on AI and HPC workloads, even on the same hardware, with frequent updates to our software stack. For our latest performance data, refer to these pages for [AI](https://developer.nvidia.com/deep-learning-performance-training-inference) and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
+
+
+### Changelog
+November 2021
+- Initial release
+
+### Known issues
+There are no known issues with this tool.
+
+
+
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/TSPP_Architecture.png b/Tools/PyTorch/TimeSeriesPredictionPlatform/TSPP_Architecture.png
new file mode 100644
index 00000000..6f6660a9
Binary files /dev/null and b/Tools/PyTorch/TimeSeriesPredictionPlatform/TSPP_Architecture.png differ
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/callbacks.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/callbacks.py
new file mode 100755
index 00000000..f7ac5e71
--- /dev/null
+++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/callbacks.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: Apache-2.0
+class Callback(object):
+ """
+ Base class for building new callbacks.
+ """
+
+ def __init__(self):
+ pass
+
+
+class CallbackContainer(object):
+ """
+ Base class for callbacks storage. 
+ """ + + def __init__(self): + pass diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/ctl_callbacks.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/ctl_callbacks.py new file mode 100755 index 00000000..875572a3 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/callbacks/ctl_callbacks.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: Apache-2.0 +import logging +import time + +import dllogger + +from callbacks.callbacks import Callback, CallbackContainer +from distributed_utils import is_main_process +from training.utils import round_dict, save_checkpoint + + +class CTLCallbackContainer(CallbackContainer): + """ + Base class for CTLTrainer callbacks storage. + """ + + def __init__(self, trainer, callbacks): + self.callbacks = callbacks + self.trainer = trainer + self._init_trainers() + self.logs = {} + super().__init__() + + def _init_trainers(self): + for callback in self.callbacks: + callback.trainer = self.trainer + + def on_train_begin(self, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_train_begin(logs) + + def on_train_end(self, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_train_end(logs) + + def on_epoch_begin(self, epoch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_epoch_begin(epoch, logs) + + def on_epoch_end(self, epoch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_epoch_end(epoch, logs) + + def on_valid_begin(self, epoch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_valid_begin(epoch, logs) + + def on_valid_end(self, epoch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_valid_end(epoch, logs) + + def on_batch_begin(self, batch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_batch_begin(batch, logs) + + def on_batch_end(self, batch, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_batch_end(batch, logs) + + def on_evaluate_end(self, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_evaluate_end(logs) + + def on_evaluate_begin(self, logs=None): + if logs is None: + logs = {} + for callback in self.callbacks: + callback.on_evaluate_begin(logs) + + +class CTLCallback(Callback): + """ + Base class for building new CTLTrainer callbacks. 
+ """ + + def __init__(self): + self.trainer = None + super().__init__() + + @property + def trainer(self): + return self._trainer + + @trainer.setter + def trainer(self, trainer): + self._trainer = trainer + + def on_train_begin(self, logs=None): + pass + + def on_train_end(self, logs=None): + pass + + def on_epoch_begin(self, epoch, logs=None): + pass + + def on_epoch_end(self, epoch, logs=None): + pass + + def on_valid_begin(self, epoch, logs=None): + pass + + def on_valid_end(self, epoch, logs=None): + pass + + def on_batch_begin(self, batch, logs=None): + pass + + def on_batch_end(self, batch, logs=None): + pass + + def on_evaluate_begin(self, logs=None): + pass + + def on_evaluate_end(self, logs=None): + pass + + +class LoggingCallback(CTLCallback): + def on_train_begin(self, logs=None): + self.trainer.logger.log( + step=[], + data={"String": "Training with {} epochs".format(self.trainer.config.trainer.get("num_epochs", 1))}, + verbosity=dllogger.Verbosity.DEFAULT, + ) + + def on_train_end(self, logs=None): + self.trainer.logger.log(step=[], data={"String": "Training Stopped"}, verbosity=dllogger.Verbosity.DEFAULT) + + def on_epoch_begin(self, epoch, logs=None): + self.trainer.logger.log(step=[], data={"String": "Epoch {}".format(epoch)}, verbosity=dllogger.Verbosity.DEFAULT) + + def on_valid_begin(self, epoch, logs=None): + self.trainer.logger.log( + step=[], data={"String": "Calculating Validation Metrics"}, verbosity=dllogger.Verbosity.DEFAULT + ) + + def on_valid_end(self, epoch, logs=None): + self.trainer.logger.log( + step=[], + data={"String": "Epoch {} Validation Metrics: {}".format(epoch, round_dict(logs))}, + verbosity=dllogger.Verbosity.DEFAULT, + ) + + def on_evaluate_begin(self, logs=None): + self.trainer.logger.log( + step=[], data={"String": "Beginning Metric Evaluation"}, verbosity=dllogger.Verbosity.DEFAULT + ) + + def on_evaluate_end(self, logs=None): + self.trainer.logger.log( + step=[], data={"String": "Evaluation Metrics: {}".format(round_dict(logs))}, verbosity=dllogger.Verbosity.DEFAULT + ) + + +class EarlyStopping(CTLCallback): + def __init__(self, metric="val_loss", max_diff=0, patience=5): + self.metric = metric + self.max_diff = max_diff + self.patience = patience + self.stopped_epochs = 0 + self.best_loss = None + super().__init__() + + def on_epoch_end(self, epoch, logs=None): + epoch_loss = logs.get(self.metric, None) + if epoch_loss is None: + return + + if self.best_loss is None or epoch_loss < self.best_loss: + self.best_loss = epoch_loss + return + + if (epoch_loss - self.best_loss) > self.max_diff: + self.stopped_epochs += 1 + if self.stopped_epochs >= self.patience: + self.trainer._stop_training = True + self.trainer.logger.log( + step=[], data={"String": f"Applying early stopping"}, verbosity=dllogger.Verbosity.DEFAULT + ) + else: + self.stopped_epochs = 0 + + +class SaveBestCheckpoint(CTLCallback): + def __init__(self, metric="val_loss"): + self.metric = metric + self.best_loss = None + super().__init__() + + def on_epoch_end(self, epoch, logs=None): + epoch_loss = logs.get(self.metric, None) + if epoch_loss is None: + return + + if self.best_loss is None or epoch_loss < self.best_loss: + self.best_loss = epoch_loss + if is_main_process(): + save_checkpoint(self.trainer, checkpoint_dir=self.trainer.log_path, filename="best_checkpoint.pth.tar") + return + + +class MeanAccumulator: + def __init__(self): + self.sum = 0 + self.count = 0 + + def consume(self, value): + self.sum += value + self.count += 1 + + @property + def value(self): + if 
self.count == 0: + return 0 + return self.sum / self.count + + +class ThroughputBenchmark(CTLCallback): + def __init__(self, warmup_epochs=0): + self.warmup_epochs = warmup_epochs + self.train_throughput = MeanAccumulator() + self.valid_throughput = MeanAccumulator() + self.epoch_train_start = None + self.epoch_train_end = None + super().__init__() + + def on_train_end(self, logs=None): + if self.train_throughput.value > 0: + logs["Train it/s"] = self.train_throughput.value + logs["Valid it/s"] = self.valid_throughput.value + + def on_epoch_begin(self, epoch, logs=None): + self.epoch_train_start = time.time() + + def on_valid_end(self, epoch, logs=None): + if epoch >= self.warmup_epochs: + train_epoch_time = self.epoch_train_end - self.epoch_train_start + valid_epoch_time = time.time() - self.epoch_train_end + train_iter_per_sec = self.trainer.train_dataset_len / train_epoch_time + valid_iter_per_sec = self.trainer.valid_dataset_len / valid_epoch_time + + logs["Train epoch it/s"] = train_iter_per_sec + logs["Valid epoch it/s"] = valid_iter_per_sec + + self.train_throughput.consume(train_iter_per_sec) + self.valid_throughput.consume(valid_iter_per_sec) + + def on_valid_begin(self, batch, logs=None): + self.epoch_train_end = time.time() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/amp/True.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/amp/True.yaml new file mode 100755 index 00000000..7c04ef70 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/amp/True.yaml @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + trainer: + AMP: True diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/early_stopping.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/early_stopping.yaml new file mode 100755 index 00000000..82ee329d --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/early_stopping.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + trainer: + callback: + early_stopping: + _target_: callbacks.ctl_callbacks.EarlyStopping + metric: val_loss + max_diff: 0 + patience: 5 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/save_best_checkpoint.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/save_best_checkpoint.yaml new file mode 100755 index 00000000..834d053f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/save_best_checkpoint.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + trainer: + callback: + save_best_checkpoint: + _target_: callbacks.ctl_callbacks.SaveBestCheckpoint + metric: val_loss diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/standard.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/standard.yaml new file mode 100755 index 00000000..b915597e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/standard.yaml @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + trainer: + callback: + early_stopping: + _target_: callbacks.ctl_callbacks.EarlyStopping + metric: val_loss + max_diff: 0 + patience: 5 + save_best_checkpoint: + _target_: callbacks.ctl_callbacks.SaveBestCheckpoint + metric: val_loss + throughput_benchmark: + _target_: callbacks.ctl_callbacks.ThroughputBenchmark + warmup_epochs: 0 + logging: + _target_: callbacks.ctl_callbacks.LoggingCallback + diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/throughput_benchmark.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/throughput_benchmark.yaml new file mode 100755 index 00000000..70a23184 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/callback/throughput_benchmark.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + trainer: + callback: + throughput_benchmark: + _target_: callbacks.ctl_callbacks.ThroughputBenchmark + warmup_epochs: 0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/conf_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/conf_utils.py new file mode 100644 index 00000000..841dbd43 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/conf_utils.py @@ -0,0 +1,54 @@ +from omegaconf import OmegaConf, open_dict + +from data.data_utils import DataTypes, InputTypes, translate_features + + +def append_derived_config_fields(config): + OmegaConf.set_struct(config, False) + config = config.config + features = translate_features(config.dataset.features) + with open_dict(config): + config.model.example_length = config.dataset.example_length + config.model.encoder_length = config.dataset.encoder_length + config.model.temporal_known_continuous_inp_size = len( + [x for x in features if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS] + ) + config.model.temporal_observed_continuous_inp_size = len( + [x for x in features if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS] + ) + config.model.temporal_target_size = len([x for x in features if x.feature_type == InputTypes.TARGET]) + config.model.static_continuous_inp_size = len( + [x for x in features if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS] + ) + config.model.static_categorical_inp_lens = [ + # XXX: this might be a bad idea. It is better make cardinality required. 
+ x.get("cardinality", 100) + for x in features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CATEGORICAL + ] + + config.model.temporal_known_categorical_inp_lens = [ + x.get("cardinality", 100) + for x in features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CATEGORICAL + ] + config.model.temporal_observed_categorical_inp_lens = [ + x.get("cardinality", 100) + for x in features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CATEGORICAL + ] + + config.model.num_static_vars = config.model.static_continuous_inp_size + len( + config.model.static_categorical_inp_lens + ) + config.model.num_future_vars = config.model.temporal_known_continuous_inp_size + len( + config.model.temporal_known_categorical_inp_lens + ) + config.model.num_historic_vars = sum( + [ + config.model.num_future_vars, + config.model.temporal_observed_continuous_inp_size, + config.model.temporal_target_size, + len(config.model.temporal_observed_categorical_inp_lens), + ] + ) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/config.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/config.yaml new file mode 100755 index 00000000..9015fb0e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/config.yaml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +defaults: + - hydra/job_logging: primary + - trainer: ctltrainer + - evaluator: standard + - optimizer: Adam + - criterion: MSE + - device: cuda + - callback: standard + - model: trivial_model + - dataset: electricity + - model_dataset: ${defaults.6.model}_${defaults.7.dataset} + optional: true + - model_dataset_evaluator: ${defaults.6.model}_${defaults.7.dataset}_${defaults.1.evaluator} + optional: true + - model_dataset_device: ${defaults.6.model}_${defaults.7.dataset}_${defaults.4.device} + optional: true + - ema: False + optional: true + - amp: False + optional: true + +_target_: ??? 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/onnx.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/onnx.yaml new file mode 100644 index 00000000..9411d7a0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/onnx.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + convert: + type: onnx diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/torchscript.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/torchscript.yaml new file mode 100644 index 00000000..a88224b3 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/torchscript.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + convert: + type: torchscript diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/trt.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/trt.yaml new file mode 100644 index 00000000..9bb0c4a0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/convert/trt.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + convert: + type: trt diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/GLL.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/GLL.yaml new file mode 100755 index 00000000..0c44b7f9 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/GLL.yaml @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + criterion: + _target_: criterion.GLL_wrapper diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/MSE.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/MSE.yaml new file mode 100755 index 00000000..b64605d1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/MSE.yaml @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + criterion: + _target_: criterion.MSE_wrapper diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/quantile.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/quantile.yaml new file mode 100755 index 00000000..6c9757bd --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/criterion/quantile.yaml @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + criterion: + _target_: criterion.quantile_wrapper + model: + quantiles: [ .1,.5,.9 ] + evaluator: + output_selector: 1 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/electricity.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/electricity.yaml new file mode 100755 index 00000000..244cf5a8 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/electricity.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + dataset: + _target_: data.data_utils.create_datasets + source_path: /workspace/datasets/electricity/electricity.csv + dest_path: /workspace/datasets/electricity/ + time_ids: 'days_from_start' + train_range: + - 0 + - 1315 + valid_range: + - 1308 + - 1339 + test_range: + - 1332 + - 10000 + dataset_stride: 1 + scale_per_id: True + encoder_length: 168 + example_length: 192 + features: + - name: 'categorical_id' + feature_type: 'ID' + feature_embed_type: 'CATEGORICAL' + cardinality: 371 + - name: 'hours_from_start' + feature_type: 'TIME' + feature_embed_type: 'CONTINUOUS' + - name: 'power_usage' + 
feature_type: 'TARGET' + feature_embed_type: 'CONTINUOUS' + scaler: + _target_: sklearn.preprocessing.StandardScaler + - name: 'hour' + feature_type: 'KNOWN' + feature_embed_type: 'CATEGORICAL' + cardinality: 25 + - name: 'day_of_week' + feature_type: 'KNOWN' + feature_embed_type: 'CATEGORICAL' + cardinality: 8 + - name: 'hours_from_start' + feature_type: 'KNOWN' + feature_embed_type: 'CONTINUOUS' + scaler: + _target_: sklearn.preprocessing.StandardScaler + - name: 'categorical_id' + feature_type: 'STATIC' + feature_embed_type: 'CATEGORICAL' + cardinality: 371 + train_samples: 450000 + valid_samples: 50000 + binarized: True + + time_series_count: 370 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/traffic.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/traffic.yaml new file mode 100755 index 00000000..32975183 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/dataset/traffic.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + dataset: + _target_: data.data_utils.create_datasets + source_path: /workspace/datasets/traffic/traffic.csv + dest_path: /workspace/datasets/traffic/ + time_ids: 'sensor_day' + train_range: + - 0 + - 151 + valid_range: + - 144 + - 166 + test_range: + - 159 + - 2000 + dataset_stride: 1 + scale_per_id: False + encoder_length: 168 + example_length: 192 + features: + - name: 'id' + feature_type: 'ID' + feature_embed_type: 'CATEGORICAL' + cardinality: 964 + - name: 'hours_from_start' + feature_type: 'TIME' + feature_embed_type: 'CONTINUOUS' + - name: 'values' + feature_type: 'TARGET' + feature_embed_type: 'CONTINUOUS' + scaler: + _target_: sklearn.preprocessing.StandardScaler + - name: 'time_on_day' + feature_type: 'KNOWN' + feature_embed_type: 'CONTINUOUS' + scaler: + target: sklearn.preprocessing.StandardScaler + - name: 'day_of_week' + feature_type: 'KNOWN' + feature_embed_type: 'CATEGORICAL' + cardinality: 8 + - name: 'hours_from_start' + feature_type: 'KNOWN' + feature_embed_type: 'CONTINUOUS' + scaler: + target: sklearn.preprocessing.StandardScaler + - name: 'categorical_id' + feature_type: 'STATIC' + feature_embed_type: 'CATEGORICAL' + cardinality: 964 + train_samples: 450000 + valid_samples: 50000 + binarized: True + time_series_count: 964 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/deployment_config.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/deployment_config.yaml new file mode 100644 index 00000000..18493980 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/deployment_config.yaml @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +defaults: + - export: ts-trace + - convert: torchscript + - inference: triton +_target_: ??? 
+ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cpu.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cpu.yaml new file mode 100755 index 00000000..a5360411 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cpu.yaml @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + device: + name: cpu diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda.yaml new file mode 100755 index 00000000..94f5e210 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + device: + name: cuda + world_size: 1 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda_8GPU.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda_8GPU.yaml new file mode 100755 index 00000000..2820e644 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/device/cuda_8GPU.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + device: + name: cuda + world_size: 8 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/ema/True.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/ema/True.yaml new file mode 100755 index 00000000..8feacb37 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/ema/True.yaml @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + trainer: + ema: + decay: 0.999 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/evaluator/standard.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/evaluator/standard.yaml new file mode 100755 index 00000000..fde09dd0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/evaluator/standard.yaml @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + evaluator: + _target_: evaluators.evaluation_metrics.MetricEvaluator + output_selector: -1 + label_selector: -1 + metrics: + - MSE + - MAE + - RMSE + - SMAPE + precision: 5 + use_weights: False diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/onnx.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/onnx.yaml new file mode 100644 index 00000000..422bd167 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/onnx.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + export: + type: onnx diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-script.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-script.yaml new file mode 100644 index 00000000..58d9edb4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-script.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + export: + type: ts-script diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-trace.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-trace.yaml new file mode 100644 index 00000000..07afa99f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/export/ts-trace.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + inference: + export: + type: ts-trace diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/primary.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/primary.yaml new file mode 
100755 index 00000000..7caf6df7 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/primary.yaml @@ -0,0 +1,15 @@ +# @package _group_ +# SPDX-License-Identifier: Apache-2.0 +version: 1 +formatters: + simple: + format: '%(message)s' +handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout +root: + handlers: [console] + level: INFO +disable_existing_loggers: false diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/secondary.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/secondary.yaml new file mode 100755 index 00000000..8b82e5dc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/hydra/job_logging/secondary.yaml @@ -0,0 +1,15 @@ +# @package _group_ +# SPDX-License-Identifier: Apache-2.0 +version: 1 +formatters: + simple: + format: '%(message)s' +handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout +root: + handlers: [console] + level: ERROR +disable_existing_loggers: false diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/native.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/native.yaml new file mode 100644 index 00000000..7ad7c13b --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/native.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + evaluator: + checkpoint: ??? + inference: + _target_: inference.inference.run_inference + batch_size: 64 + precision: fp32 + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/triton.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/triton.yaml new file mode 100644 index 00000000..7ac966b0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference/triton.yaml @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + evaluator: + checkpoint: ??? + inference: + _target_: inference.deployer.run_deployment + batch_size: 64 + precision: fp32 + optimize: False + skip_conversion: False + just_deploy: False + accelerator: none + gpu: 0 + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference_config.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference_config.yaml new file mode 100644 index 00000000..387bf1f0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/inference_config.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +defaults: + - inference: native + - device: cuda +_target_: ??? 
+ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/lstm.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/lstm.yaml new file mode 100755 index 00000000..869d038d --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/lstm.yaml @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + model: + _target_: models.lstm.LSTM + hidden_size: 160 + dropout: 0.1 + missing_data_strategy: 'mask' + trainer: + batch_size: 2048 + num_epochs: 10 + optimizer: + lr: .004 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/tft.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/tft.yaml new file mode 100755 index 00000000..34551014 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/tft.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + model: + _target_: models.tft_pyt.modeling.TemporalFusionTransformer + quantiles: [ .5 ] + n_head: 4 + hidden_size: 160 + dropout: 0.1 + attn_dropout: 0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/trivial_model.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/trivial_model.yaml new file mode 100755 index 00000000..4f9e41aa --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model/trivial_model.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + model: + _target_: models.trivial_model.TrivialModel + test_method: test_with_previous diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_electricity.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_electricity.yaml new file mode 100755 index 00000000..041c43a8 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_electricity.yaml @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + model: + quantiles: [.1,.5,.9] + n_head: 4 + hidden_size: 128 + dropout: 0.1 + attn_dropout: 0 + trainer: + batch_size: 1024 + num_epochs: 20 + optimizer: + lr: .001 + gradient_norm: 1.0 + evaluator: + metrics: [P50, P90, MSE, MAE] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_traffic.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_traffic.yaml new file mode 100755 index 00000000..9db85246 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset/tft_traffic.yaml @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + model: + quantiles: [.1 ,.5 , .9] + n_head: 4 + hidden_size: 128 + dropout: 0.3 + attn_dropout: 0 + trainer: + batch_size: 1024 + num_epochs: 10 + optimizer: + lr: .001 + gradient_norm: 1.0 + evaluator: + metrics: [P50, P90, MSE, MAE] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_electricity_cuda_8GPU.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_electricity_cuda_8GPU.yaml new file mode 100755 index 00000000..b6bbd303 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_electricity_cuda_8GPU.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + lr: .001 + gradient_norm: 1.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_traffic_cuda_8GPU.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_traffic_cuda_8GPU.yaml new file mode 100755 index 00000000..b6bbd303 --- /dev/null 
+++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/model_dataset_device/tft_traffic_cuda_8GPU.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + lr: .001 + gradient_norm: 1.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/ASGD.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/ASGD.yaml new file mode 100755 index 00000000..609f527a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/ASGD.yaml @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.01 + lambd: 0.0001 + alpha: 0.75 + t0: 1000000.0 + weight_decay: 0.0 + name: ASGD diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adadelta.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adadelta.yaml new file mode 100755 index 00000000..c861c6d0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adadelta.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 1.0 + rho: 0.9 + eps: 1e-06 + weight_decay: 0.0 + name: Adadelta \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adagrad.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adagrad.yaml new file mode 100755 index 00000000..edd4db50 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adagrad.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.01 + lr_decay: 0.0 + weight_decay: 0.0 + eps: 1e-10 + name: Adagrad diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adam.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adam.yaml new file mode 100755 index 00000000..e6c776c8 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adam.yaml @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.001 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + amsgrad: False + name: Adam diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/AdamW.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/AdamW.yaml new file mode 100755 index 00000000..7afbfc0d --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/AdamW.yaml @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.001 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + amsgrad: False + name: AdamW diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adamax.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adamax.yaml new file mode 100755 index 00000000..67befa77 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Adamax.yaml @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.002 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + name: Adamax diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/LBFGS.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/LBFGS.yaml new file mode 100755 index 00000000..50018697 --- 
/dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/LBFGS.yaml @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 1.0 + max_iter: 20 + max_eval: null + tolerance_grad: 1e-7 + tolerance_change: 1e-9 + history_size: 100 + line_search_fn: null + name: LBFGS + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/RMSprop.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/RMSprop.yaml new file mode 100755 index 00000000..ea86116c --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/RMSprop.yaml @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.01 + alpha: 0.99 + eps: 1e-8 + weight_decay: 0.0 + momentum: 0.0 + centered: False + name: RMSprop diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Rprop.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Rprop.yaml new file mode 100755 index 00000000..05a273b0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/Rprop.yaml @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.01 + etas: [0.5, 1.2] + step_sizes: [1e-06, 50] + name: Rprop diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SGD.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SGD.yaml new file mode 100755 index 00000000..537a4e63 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SGD.yaml @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.01 + momentum: 0.0 + weight_decay: 0.0 + dampening: 0.0 + nesterov: False + name: SGD + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SparseAdam.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SparseAdam.yaml new file mode 100755 index 00000000..60cda643 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/optimizer/SparseAdam.yaml @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# @package _global_ +config: + optimizer: + _target_: optimizers.optimizer_wrapped + lr: 0.001 + betas: [0.9, 0.999] + eps: 1e-8 + name: SparseAdam diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/preproc_config.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/preproc_config.yaml new file mode 100755 index 00000000..adc89019 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/preproc_config.yaml @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +defaults: + - dataset: electricity +_target_: data.data_utils.preprocess + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/ctltrainer.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/ctltrainer.yaml new file mode 100755 index 00000000..251a0693 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/ctltrainer.yaml @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + trainer: + batch_size: 32 + num_workers: 3 + num_epochs: 15 + log_interval: 25 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/stattrainer.yaml b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/stattrainer.yaml new file mode 100755 index 00000000..fa2ce7af --- /dev/null +++ 
b/Tools/PyTorch/TimeSeriesPredictionPlatform/conf/trainer/stattrainer.yaml @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +config: + trainer: + type: stat diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/criterion.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/criterion.py new file mode 100755 index 00000000..9e94703c --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/criterion.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def MAE_wrapper(config): + return nn.L1Loss() + + +def MSE_wrapper(config): + return nn.MSELoss() + + +def quantile_wrapper(config): + return QuantileLoss(config) + + +# assumed for anomaly detection task +def cross_entropy_wrapper(config): + return nn.CrossEntropyLoss() + + +def huber_wrapper(config): + return nn.SmoothL1Loss() + + +def GLL_wrapper(config): + return GaussianLogLikelihood() + + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.quantiles = list(config.model.quantiles) + + def forward(self, predictions, targets): + losses = [] + + for i, q in enumerate(self.quantiles): + diff = predictions[..., i] - targets[..., 0] + loss = ((1 - q) * F.relu(diff) + q * F.relu(-diff)).mean() + losses.append(loss) + losses = torch.stack(losses) + + return losses + + +class WeightedL1Loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, inputs, targets, weights): + x = F.l1_loss(inputs, targets, reduction="none") + x = x * weights # broadcasted along 0th dimension + x = x.mean() + return x + + +class GaussianLogLikelihood(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, inputs, targets): + # Inputs with shape [BS, window, 2] (mean + std) + # Targets with shape [BS, window, 1] + + zero_index = targets[..., 0:1] != 0 + mu = inputs[..., 0:1] + sigma = inputs[..., 1:2] + distribution = torch.distributions.normal.Normal(mu[zero_index], sigma[zero_index]) + likelihood = distribution.log_prob(targets[zero_index]) + likelihood = -likelihood.view(inputs.shape[0], inputs.shape[1]) + loss = likelihood.mean(0).mean() + + return loss diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/data/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/data/data_utils.py new file mode 100755 index 00000000..2249dbc5 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/data/data_utils.py @@ -0,0 +1,697 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2020 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import enum +import math +import os +import pickle +from abc import ABC +from bisect import bisect +from collections import namedtuple +from itertools import combinations + +import hydra +import numpy as np +import pandas as pd +import sklearn.preprocessing +import torch +from dgl.transform import metis_partition_assignment +from omegaconf.dictconfig import DictConfig +from omegaconf.listconfig import ListConfig +from sklearn.impute import SimpleImputer +from torch.utils.data import Dataset + + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + + +DTYPE_MAP = { + DataTypes.CONTINUOUS: np.float32, + DataTypes.CATEGORICAL: np.int64, + DataTypes.DATE: np.datetime64, + DataTypes.STR: str, +} + + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + WEIGHT = 6 + SAMPLE_WEIGHT = 7 + + +class FeatureSpec: + enabled_attributes = ["name", "feature_type", "feature_embed_type", "cardinality", "scaler"] + + def __init__(self, input_dict): + for key in input_dict: + if key in self.enabled_attributes: + setattr(self, key, input_dict[key]) + else: + raise ValueError("Attribute not enabled: {attr}".format(attr=key)) + self.name = input_dict["name"] + self.feature_type = InputTypes[input_dict["feature_type"]] + self.feature_embed_type = DataTypes[input_dict["feature_embed_type"]] + + def get(self, key, value=None): + if hasattr(self, key): + return getattr(self, key) + else: + return value + + def __str__(self): + return str((self.name, self.feature_type, self.feature_embed_type)) + + def __repr__(self): + return str(self) + + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.WEIGHT, DataTypes.CONTINUOUS), + (InputTypes.SAMPLE_WEIGHT, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL), +] +FEAT_NAMES = ["s_cat", "s_cont", "k_cat", "k_cont", "o_cat", "o_cont", "target", "weight", "sample_weight", "id"] + + +def translate_features(features, preproc=False): + all_features = [FeatureSpec(feature) for feature in features] + if preproc: + return all_features + return [FeatureSpec({"name": "_id_", "feature_type": "ID", "feature_embed_type": "CATEGORICAL"})] + [ + feature for feature in all_features if feature.feature_type != InputTypes.ID + ] + + +class TSBaseDataset(Dataset): + def __init__(self, features, path=None, encoder_length=52, example_length=54, stride=1): + super().__init__() + assert example_length > encoder_length + self.features = features + self.encoder_length = encoder_length + self.example_length = example_length + self.stride = stride + self.path = path + self.load() + self.features = [i for i in self.features if i.feature_type != InputTypes.TIME] + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self.group_lens = [(g.shape[0] - self.example_length + 1) // self.stride for g in self.grouped] + self._cum_examples_in_group = np.cumsum(self.group_lens) + + 
self.feature_type_col_map = [ + [i for i, f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER + ] + + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def get_probabilities(self): + sampled = [] + for i in range(len(self.grouped)): + group_len = self.group_lens[i] + group = self.grouped[i] + sample_weights = group[-1] + sampled.append(sample_weights[np.arange(0, self.stride * group_len, self.stride)]) + sampled = np.concatenate(sampled) + return sampled + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx - 1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride : e_idx * self.stride + self.example_length]) + if feat.size + else torch.empty(0) + for feat in group + ] + + out = dict(zip(FEAT_NAMES, tensors)) + + # XXX: dataset shouldn't be aware of encoder_lenght probably. Masking should occur on some other level + out["weight"] = out["weight"][self.encoder_length :, :] if out["weight"].numel() else out["weight"] + out["id"] = out["id"][0, :] + return out + + +class TSDataset(TSBaseDataset): + def load(self): + data = pd.read_csv(self.path) + col_names = ["_id_"] + [ + x.name + for x in self.features + if x.feature_embed_type != DataTypes.STR + and x.feature_type != InputTypes.TIME + and x.feature_type != InputTypes.ID + ] + + self.grouped = [group[1][col_names].values.astype(np.float32).view(dtype=np.int32) for group in data.groupby("_id_")] + + +class TSBinaryDataset(TSBaseDataset): + def load(self): + self.grouped = pickle.load(open(self.path, "rb")) + + +class StatDataset(Dataset): + def __init__(self, features, path=None, encoder_length=52, example_length=54, stride=1, split=None, split_feature=None): + super().__init__() + assert example_length > encoder_length + self.features = translate_features(features) + self.time_feature = split_feature + self.weight_features = [feature.name for feature in self.features if feature.feature_type == InputTypes.WEIGHT] + self.encoder_length = encoder_length + self.example_length = example_length + self.horizon = self.example_length - self.encoder_length + self.stride = stride + self.split = split + + self.id_col_name = next(x.name for x in self.features if x.feature_type == InputTypes.ID) + self.col_dtypes = {v.name: DTYPE_MAP[v.feature_embed_type] for v in self.features} + self.data = pd.read_csv(os.path.join(path, "full.csv"), dtype=self.col_dtypes) + self.data = self.data.groupby(self.id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(self.id_col_name)) + self.endog = [feature.name for feature in self.features if feature.feature_type == InputTypes.TARGET] + self.exog = [ + feature.name + for feature in self.features + if feature.feature_type in [InputTypes.KNOWN, InputTypes.OBSERVED, InputTypes.STATIC] + and feature.feature_embed_type == DataTypes.CONTINUOUS + ] + self.grouped = [group[1] for group in self.grouped] + self.grouped = [ + group + for group in self.grouped + if len(group[group[self.time_feature] <= self.split]) >= self.encoder_length + and len(group[group[self.time_feature] > self.split]) >= self.horizon + ] + + self._cum_examples_in_group = np.cumsum( + [(len(group[group[self.time_feature] > split]) - 
self.horizon) // self.stride + 1 for group in self.grouped] + ) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + if idx > self._cum_examples_in_group[-1]: + raise StopIteration + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx - 1] if g_idx else idx + group = self.grouped[g_idx] + test = group[group[self.time_feature] > self.split] + train = group[group[self.time_feature] <= self.split] + test_slice = test[self.stride * e_idx : self.stride * e_idx + self.horizon] + if (self.encoder_length - self.stride * e_idx) > 0: + train_slice = train[-(self.encoder_length - self.stride * e_idx) :].append( + test[max(0, self.stride * e_idx - self.encoder_length) : self.stride * e_idx] + ) + else: + train_slice = test[max(0, self.stride * e_idx - self.encoder_length) : self.stride * e_idx] + + train_out = {"endog": train_slice[self.endog], "exog": train_slice[self.exog]} + + test_out = {"endog": test_slice[self.endog], "exog": test_slice[self.exog], "id": test_slice[self.id_col_name]} + if len(self.weight_features): + test_out["weight"] = test_slice[self.weight_features] + return train_out, test_out + + +def create_datasets(config): + # XXX: We should probably fill all the fields in a config during it's construction with default + # values so we avoid using `get`. This will reduce the number of bugs in the future. + def select_dataset_class(config): + binarized = config.dataset.get("binarized", False) + graph_dataset = config.dataset.get("graph", False) and config.model.get("graph_eligible", False) + + if binarized and graph_dataset: + specific_args = { + "graph": os.path.join(config.dataset.dest_path, "graph.bin"), + "graph_partitions": config.dataset.graph_partitions, + "partition_joining_coef": config.dataset.partition_joining_coef, + } + return TemporalClusteredGraphDataset, specific_args + elif binarized: + return TSBinaryDataset, {} + elif not binarized and graph_dataset: + raise NotImplemented + else: + return TSDataset, {} + + common_args = { + # XXX: calling this every time we need features in cumbersome. We could call this when the config + # is constructed and enjoy not typig this line in every single function. 
+ "features": translate_features(config.dataset.features), + "encoder_length": config.dataset.encoder_length, + "example_length": config.dataset.example_length, + "stride": config.dataset.get("stride", 1), + } + + path_template = os.path.join(config.dataset.dest_path, "{{subset}}.{extension}") + path_template = path_template.format(extension="bin" if config.dataset.get("binarized", False) else "csv") + dataset_class, specific_args = select_dataset_class(config) + + train = dataset_class(path=path_template.format(subset="train"), **common_args, **specific_args) + valid = dataset_class(path=path_template.format(subset="valid"), **common_args, **specific_args) + test = dataset_class(path=path_template.format(subset="test"), **common_args, **specific_args) + + return train, valid, test + + +def map_dt(dt): + if isinstance(dt, int): + dt = dt + elif isinstance(dt, ListConfig): + dt = datetime.datetime(*dt) + elif isinstance(dt, str): + dt = datetime.datetime.strptime(dt, "%Y-%m-%d") + return dt + + +class ClusteredGraphDataset(Dataset): + def __init__(self, graph, graph_partitions=10, partition_joining_coef=2): + if isinstance(graph, str): + self.graph = pickle.load(open(graph, "rb")) + else: + self.graph = graph + + assert isinstance(graph_partitions, int) and graph_partitions > 0 + assert partition_joining_coef <= graph_partitions + + self.part_count = graph_partitions + if graph_partitions > 1: + self.partition = metis_partition_assignment(self.graph, self.part_count) + else: + self.partition = torch.zeros(self.graph.num_nodes(), dtype=torch.int64) + self.joining_coef = partition_joining_coef + + def __len__(self): + return math.comb(self.part_count, self.joining_coef) + + def __getitem__(self, idx): + indicator = self.idx_to_combination(self.part_count, self.joining_coef, idx) + c_ids = np.nonzero(indicator)[0] + subgraph = self.get_subgraph(c_ids) + return subgraph + + def get_subgraph(self, c_ids): + ids = sum([self.partition == i for i in c_ids]).bool() + return self.graph.subgraph(ids) + + def idx_to_combination(self, n, r, m): + """ + n: int total number of elements + r: int number of elements in combination + m: int 0-based index of combination in reverse-lexicographic order + + Returns list - indicator vector of chosen elements + """ + assert m < math.comb(n, r), "Index out of range" + + out = [0] * n + while n > 0: + if n > r and r >= 0: + y = math.comb(n - 1, r) + else: + y = 0 + if m >= y: + m -= y + out[n - 1] = 1 + r -= 1 + n -= 1 + return out + + +class TemporalClusteredGraphDataset(ClusteredGraphDataset): + def __init__(self, features, graph, path=None, encoder_length=52, example_length=54, stride=1, **kwargs): + super().__init__(graph, **kwargs) + assert example_length > encoder_length + self.features = [i for i in features if i.feature_type != InputTypes.TIME] + self.encoder_length = encoder_length + self.example_length = example_length + self.stride = stride + self.path = path + + self.feature_type_col_map = [ + np.array([i for i, f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x]) + for x in FEAT_ORDER + ] + + grouped = pickle.load(open(self.path, "rb")) + # We assume that all the time series are of the same length and have the same set of features + assert all([x.shape == grouped[0].shape for x in grouped]) + + ndata = np.stack(grouped) + self.ndata = { + name: ndata[:, :, ids].view(dtype=np.float32).astype(DTYPE_MAP[f[1]]) + if not ids.size == 0 + else np.empty((*ndata.shape[:-1], 0)) + for name, f, ids in zip(FEAT_NAMES, FEAT_ORDER, 
self.feature_type_col_map) + } + + self.t_dim = ndata.shape[1] + self.n_timeslices = (self.t_dim - self.example_length + 1) // self.stride + + def __len__(self): + # the number of possible subgraphs times the number of possible time slices + return super().__len__() * self.n_timeslices + + def __getitem__(self, idx): + g_idx = idx // self.n_timeslices + t_idx = idx - g_idx * self.n_timeslices + subgraph = super().__getitem__(g_idx) + node_ids = np.array(subgraph.ndata["_ID"]) + for k, v in self.ndata.items(): + subgraph.ndata[k] = torch.from_numpy( + v[node_ids, t_idx * self.stride : t_idx * self.stride + self.example_length, :] + ) + + return subgraph + + +def get_dataset_splits(df, config): + if hasattr(config, "valid_boundary") and config.valid_boundary != None: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + valid_boundary = map_dt(config.valid_boundary) + for _, group in df.groupby("_id_"): + index = group[config.time_ids] + _train = group.loc[index < valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length) : (len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len) : (len(_train) + 2 * forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + + elif df.dtypes[config.time_ids] not in [np.float64, np.int]: + index = df[config.time_ids] + + train = df.loc[(index >= map_dt(config.train_range[0])) & (index < map_dt(config.train_range[1]))] + valid = df.loc[(index >= map_dt(config.valid_range[0])) & (index < map_dt(config.valid_range[1]))] + test = df.loc[(index >= map_dt(config.test_range[0])) & (index < map_dt(config.test_range[1]))] + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + + +def recombine_datasets(train, valid, test, config): + if hasattr(config, "valid_boundary") and config.valid_boundary != None: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. 
+ # The test split is shifted by the number of the forecast steps from the valid split + train_temp = [] + valid_temp = [] + for g0, g1 in zip(train.groupby("_id_"), valid.groupby("_id_")): + _train = g0[1].iloc[: -config.encoder_length] + _valid = g1[1].iloc[:forecast_len] + train_temp.append(_train) + valid_temp.append(_valid) + train = pd.concat(train_temp, axis=0) + valid = pd.concat(valid_temp, axis=0) + elif train.dtypes[config.time_ids] not in [np.float64, np.int]: + + train = train[train[config.time_ids] < map_dt(config.valid_range[0])] + valid = valid[valid[config.time_ids] < map_dt(config.test_range[0])] + else: + train = train[train[config.time_ids] < config.valid_range[0]] + valid = valid[valid[config.time_ids] < config.test_range[0]] + return pd.concat((train, valid, test)) + + +def flatten_ids(df, id_features): + current_id = df[id_features[0]].astype("category").cat.codes + 1 + for additional_id in id_features[1:]: + current_id = df[additional_id].astype("category").cat.codes * (current_id.max() + 1) + current_id + 1 + df["_id_"] = current_id.astype("category").cat.codes + + +def impute(df, config): + # XXX This ensures that out scaling will have the same mean. We still need to check the variance + # XXX does it work in place? + if not (config.get("missing_data_label", False)): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy="mean") + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values # XXX this probably works in place. Check that! + col_mask = (data == config.missing_data_label).all(axis=0) + data[:, ~col_mask] = imp.fit_transform(data) + return data, mask + + +def map_scalers(features): + mapping = {} + for feature in features: + if feature.get("scaler", None): + if mapping.get(feature.scaler, None): + mapping[feature.scaler].append(feature.name) + else: + mapping[feature.scaler] = [feature.name] + return mapping + + +class CompositeScaler: + def __init__(self, target_features, input_continuous, scale_per_id): + self.target_mapping = map_scalers(target_features) + self.continuous_mapping = map_scalers(input_continuous) + self.target_features = target_features + self.input_continuous = input_continuous + self.scale_per_id = scale_per_id + self.continuous_scalers = {} + self.target_scalers = {} + + def fit(self, df): + for k, v in self.continuous_mapping.items(): + self.continuous_scalers[k] = {} + if self.scale_per_id: + for identifier, sliced in df.groupby("_id_"): + scaler = hydra.utils.instantiate(k).fit(sliced[v]) + self.continuous_scalers[k][identifier] = scaler + + else: + scaler = hydra.utils.instantiate(k).fit(df[v]) + self.continuous_scalers[k][""] = scaler + + for k, v in self.target_mapping.items(): + self.target_scalers[k] = {} + if self.scale_per_id: + for identifier, sliced in df.groupby("_id_"): + scaler = hydra.utils.instantiate(k).fit(sliced[v]) + self.target_scalers[k][identifier] = scaler + + else: + scaler = hydra.utils.instantiate(k).fit(df[v]) + self.target_scalers[k][""] = scaler + + def apply_scalers(self, df, name=None): + if name is None: + name = df.name + for k, v in self.continuous_mapping.items(): + df[v] = self.continuous_scalers[k][name].transform(df[v]) + for k, v in self.target_mapping.items(): + df[v] = self.target_scalers[k][name].transform(df[v]) + return df + + def transform(self, df): + if self.scale_per_id: + df = df.groupby("_id_").apply(self.apply_scalers) + else: + df = self.apply_scalers(df, name="") + return df + + def 
inverse_transform_targets(self, values, ids): + # Assuming single targets for now + if len(self.target_scalers) > 0: + + scalers = list(self.target_scalers.values())[0] + if self.scale_per_id: + flat_values = values.flatten() + flat_ids = np.repeat(ids, values.shape[1]) + df = pd.DataFrame({"id": flat_ids, "value": flat_values}) + df_list = [] + for identifier, sliced in df.groupby("id"): + df_list.append(scalers[identifier].inverse_transform(sliced["value"])) + return np.concatenate(df_list, axis=None) + else: + flat_values = values.flatten() + flat_values = scalers[""].inverse_transform(flat_values) + return flat_values + return values + + +def get_feature_splits(features): + splits = {} + splits["dates"] = [feature for feature in features if feature.feature_embed_type == DataTypes.DATE] + splits["target_features"] = [feature for feature in features if feature.feature_type == InputTypes.TARGET] + splits["time_feature"] = [feature for feature in features if feature.feature_type == InputTypes.TIME][0] + splits["id_features"] = [feature for feature in features if feature.feature_type == InputTypes.ID] + splits["input_categoricals"] = [ + feature + for feature in features + if feature.feature_embed_type == DataTypes.CATEGORICAL + and feature.feature_type in [InputTypes.STATIC, InputTypes.KNOWN, InputTypes.OBSERVED] + ] + splits["input_continuous"] = [ + feature + for feature in features + if feature.feature_embed_type == DataTypes.CONTINUOUS + and feature.feature_type in [InputTypes.STATIC, InputTypes.KNOWN, InputTypes.OBSERVED] + ] + return splits + + +def preprocess(config): + config = config.dataset + dest_path = config.dest_path + features = translate_features(config["features"], preproc=True) + feat_splits = get_feature_splits(features) + + print("Reading in data") + df = pd.read_csv(config.source_path, parse_dates=[d.name for d in feat_splits["dates"]]) + print("Sorting on time feature") + df = df.sort_values([feat_splits["time_feature"].name]) + f_names = [feature.name for feature in features] + [config.time_ids] + df = df[list(set(f_names))] + flatten_ids(df, [feature.name for feature in feat_splits["id_features"]]) + + if config.get("missing_data_label", False): + df = df.replace(config.get("missing_data_label"), np.NaN) + df = df.dropna(subset=[t.name for t in feat_splits["target_features"]]) + print("Mapping categoricals to bounded range") + + for categorical in feat_splits["input_categoricals"]: + df[categorical.name] = df[categorical.name].astype("category").cat.codes + + print("Splitting datasets") + train, valid, test = get_dataset_splits(df, config) + train = train.groupby("_id_").filter(lambda x: len(x) >= config.example_length) + valid = valid.groupby("_id_").filter(lambda x: len(x) >= config.example_length) + test = test.groupby("_id_").filter(lambda x: len(x) >= config.example_length) + if hasattr(config, "valid_boundary") and config.valid_boundary != None: + arriter = ["_id_"] + else: + arriter = [cat.name for cat in feat_splits["input_categoricals"]] + ["_id_"] + + if config.get("drop_unseen", False): + for categorical in arriter: + seen_values = train[categorical].unique() + valid = valid[valid[categorical].isin(seen_values)] + test = test[test[categorical].isin(seen_values)] + print("Applying normalization") + scaler = CompositeScaler( + feat_splits["target_features"], feat_splits["input_continuous"], scale_per_id=config.scale_per_id + ) + scaler.fit(train) + + train = scaler.transform(train) + valid = scaler.transform(valid) + test = scaler.transform(test) + + 
cont_features_names = [continuous.name for continuous in feat_splits["input_continuous"]] + train[cont_features_names] = train[cont_features_names].replace(np.NaN, 10 ** 9) + valid[cont_features_names] = valid[cont_features_names].replace(np.NaN, 10 ** 9) + test[cont_features_names] = test[cont_features_names].replace(np.NaN, 10 ** 9) + + print("Saving processed data") + os.makedirs(dest_path, exist_ok=True) + + train.to_csv(os.path.join(dest_path, "train.csv")) + valid.to_csv(os.path.join(dest_path, "valid.csv")) + test.to_csv(os.path.join(dest_path, "test.csv")) + recombine_datasets(train, valid, test, config).to_csv(os.path.join(dest_path, "full.csv")) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!!]] + if config.get("binarized", False): + col_names = ["_id_"] + [ + x.name + for x in features + if x.feature_embed_type != DataTypes.STR + and x.feature_type != InputTypes.TIME + and x.feature_type != InputTypes.ID + ] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby("_id_")] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby("_id_")] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby("_id_")] + + pickle.dump(grouped_train, open(os.path.join(dest_path, "train.bin"), "wb")) + pickle.dump(grouped_valid, open(os.path.join(dest_path, "valid.bin"), "wb")) + pickle.dump(grouped_test, open(os.path.join(dest_path, "test.bin"), "wb")) + + with open(os.path.join(dest_path, "composite_scaler.bin"), "wb") as f: + pickle.dump(scaler, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/data/script_download_data.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/data/script_download_data.py new file mode 100755 index 00000000..22c80adf --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/data/script_download_data.py @@ -0,0 +1,353 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
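A minimal sketch, assuming only NumPy, of the binary round trip behind the binarized path above: preprocess stores each _id_ group as a float32 array whose 4-byte patterns are viewed as int32 before pickling, and the binary dataset classes view the data back to float32 (and then cast per column via DTYPE_MAP) when loading.

import numpy as np

# Toy illustration only: view() reinterprets the same 4-byte patterns rather than
# converting values, so the round trip is lossless.
original = np.array([[1.5, 7.0], [2.5, 3.0]], dtype=np.float32)
stored = original.view(dtype=np.int32)        # what preprocess() pickles into train/valid/test.bin
recovered = stored.view(dtype=np.float32)     # what the dataset classes do before the per-column cast
assert np.array_equal(original, recovered)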
+""" +Only downloads data if the csv files are present, unless the "force_download" +argument is supplied. For new datasets, the download_and_unzip(.) can be reused +to pull csv files from an online repository, but may require subsequent +dataset-specific processing. + +Usage: + python3 script_download_data --dataset {DATASET} --output_dir {DIR} +Command line args: + DATASET: Name of dataset to download {e.g. electricity} + DIR: Path to main dataset diredtory +""" + +from __future__ import absolute_import, division, print_function + +import argparse +import gc +import os +import pickle +import sys +import warnings +from datetime import date, timedelta + +import dgl +import numpy as np +import pandas as pd +import py7zr +import pyunpack +import torch +import wget +from scipy.spatial import distance_matrix + +warnings.filterwarnings("ignore") + + +# General functions for data downloading & aggregation. +def download_from_url(url, output_path): + """Downloads a file froma url.""" + + print("Pulling data from {} to {}".format(url, output_path)) + wget.download(url, output_path) + print("done") + + +def unzip(zip_path, output_file, data_folder, use_z=False): + """Unzips files and checks successful completion.""" + + print("Unzipping file: {}".format(zip_path)) + if use_z: + py7zr.SevenZipFile(zip_path, mode="r").extractall(path=data_folder) + else: + pyunpack.Archive(zip_path).extractall(data_folder) + + # Checks if unzip was successful + if not os.path.exists(output_file): + raise ValueError("Error in unzipping process! {} not found.".format(output_file)) + + +def download_and_unzip(url, zip_path, csv_path, data_folder): + """Downloads and unzips an online csv file. + + Args: + url: Web address + zip_path: Path to download zip file + csv_path: Expected path to csv file + data_folder: Folder in which data is stored. + """ + + download_from_url(url, zip_path) + + unzip(zip_path, csv_path, data_folder) + + print("Done.") + + +# Dataset specific download routines. 
+def download_electricity(data_folder): + """Downloads electricity dataset from UCI repository.""" + + url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip" + + csv_path = os.path.join(data_folder, "LD2011_2014.txt") + zip_path = csv_path + ".zip" + + download_and_unzip(url, zip_path, csv_path, data_folder) + + print("Aggregating to hourly data") + + df = pd.read_csv(csv_path, index_col=0, sep=";", decimal=",") + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample("1h").mean().replace(0.0, np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + srs = output[label] + + start_date = min(srs.fillna(method="ffill").dropna().index) + end_date = max(srs.fillna(method="bfill").dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.0) + + tmp = pd.DataFrame({"power_usage": srs}) + date = tmp.index + tmp["t"] = (date - earliest_time).seconds / 60 / 60 + (date - earliest_time).days * 24 + tmp["days_from_start"] = (date - earliest_time).days + tmp["categorical_id"] = label + tmp["date"] = date + tmp["id"] = label + tmp["hour"] = date.hour + tmp["day"] = date.day + tmp["day_of_week"] = date.dayofweek + tmp["month"] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join="outer").reset_index(drop=True) + + output["categorical_id"] = output["id"].copy() + output["hours_from_start"] = output["t"] + output["categorical_day_of_week"] = output["day_of_week"].copy() + output["categorical_hour"] = output["hour"].copy() + + # Filter to match range used by other academic papers + output = output[(output["days_from_start"] >= 1096) & (output["days_from_start"] < 1346)].copy() + + output.to_csv(data_folder + "/electricity.csv") + + print("Done.") + + +def download_traffic(data_folder): + """Downloads traffic dataset from UCI repository.""" + + url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip" + + csv_path = os.path.join(data_folder, "PEMS_train") + zip_path = os.path.join(data_folder, "PEMS-SF.zip") + + download_and_unzip(url, zip_path, csv_path, data_folder) + + print("Aggregating to hourly data") + + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + parsed_list = [variable_type(i) for i in s.replace("[", "").replace("]", "").split()] + else: + parsed_list = [variable_type(i) for i in s.replace("[", "").replace("]", "").split(delimiter)] + + return parsed_list + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(data_folder, filename), "r") as dat: + parsed_list_from_file = process_list(dat.readlines()[0]) + return parsed_list_from_file + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(data_folder, filename), "r") as dat: + + lines = dat.readlines() + for i, line in enumerate(lines): + + array = [ + process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list(line, variable_type=str, delimiter=";") + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list("randperm")) - 1 # index from 0 + train_dayofweek = read_single_list("PEMS_trainlabels") + train_tensor = read_matrix("PEMS_train") + test_dayofweek = 
read_single_list("PEMS_testlabels") + test_tensor = read_matrix("PEMS_test") + + # Inverse permutate shuffle order + print("Shuffling") + inverse_mapping = {new_location: previous_location for previous_location, new_location in enumerate(shuffle_order)} + reverse_shuffle_order = np.array([inverse_mapping[new_location] for new_location, _ in enumerate(shuffle_order)]) + + # Group and reoder based on permuation matrix + print("Reodering") + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print("Parsing as dataframe") + labels = ["traj_{}".format(i) for i in read_single_list("stations_list")] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly["hour_on_day"] = [int(i / 6) for i in hourly.index] # sampled at 10 min intervals + if hourly["hour_on_day"].max() > 23 or hourly["hour_on_day"].min() < 0: + raise ValueError("Invalid hour! {}-{}".format(hourly["hour_on_day"].min(), hourly["hour_on_day"].max())) + + hourly = hourly.groupby("hour_on_day", as_index=True).mean()[labels] + hourly["sensor_day"] = day + hourly["time_on_day"] = hourly.index + hourly["day_of_week"] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if "traj" in c] + other_columns = [c for c in hourly_frame.columns if "traj" not in c] + flat_df = pd.DataFrame(columns=["values", "prev_values", "next_values"] + other_columns + ["id"]) + + def format_index_string(x): + """Returns formatted string for key.""" + + if x < 10: + return "00" + str(x) + elif x < 100: + return "0" + str(x) + elif x < 1000: + return str(x) + + raise ValueError("Invalid value of x {}".format(x)) + + for store in store_columns: + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ["values"] + other_columns + sliced["id"] = int(store.replace("traj_", "")) + + # Sort by Sensor-date-time + key = ( + sliced["id"].apply(str) + + sliced["sensor_day"].apply(lambda x: "_" + format_index_string(x)) + + sliced["time_on_day"].apply(lambda x: "_" + format_index_string(x)) + ) + sliced = sliced.set_index(key).sort_index() + + sliced["values"] = sliced["values"].fillna(method="ffill") + sliced["prev_values"] = sliced["values"].shift(1) + sliced["next_values"] = sliced["values"].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df["sensor_day"] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df["categorical_id"] = flat_df["id"].copy() + flat_df["hours_from_start"] = flat_df["time_on_day"] + flat_df["sensor_day"] * 24.0 + flat_df["categorical_day_of_week"] = flat_df["day_of_week"].copy() + flat_df["categorical_time_on_day"] = flat_df["time_on_day"].copy() + + flat_df.to_csv(data_folder + "/traffic.csv") + + +def construct_graph(nodes_loc, k=0.8): + """ + Constructs a graph based on a physical location of nodes + nodes_loc: 2D array num_nodes x dim + features: list of node features + """ + dist_mx = distance_matrix(nodes_loc, nodes_loc) + + std = dist_mx.std() + adj_mx = 
np.exp(-np.square(dist_mx / std)) + adj_mx[adj_mx < k] = 0 + np.fill_diagonal(adj_mx, 0) + + edges = np.nonzero(adj_mx) + graph = dgl.graph(edges, num_nodes=nodes_loc.shape[0]) + return graph + + +def main(args): + """Runs main download routine. + + Args: + expt_name: Name of experiment + force_download: Whether to force data download from scratch + output_folder: Folder path for storing data + """ + + print("#### Running download script ###") + + download_function = DOWNLOAD_FUNCTIONS[args.dataset] + + print("Getting {} data...".format(args.dataset)) + subdir = os.path.join(args.output_dir, args.dataset) + print(subdir) + if os.path.exists(subdir): + print(f"Warning: Path {subdir} exists. Overwritting files!", file=sys.stderr) + os.makedirs(subdir, exist_ok=True) + download_function(subdir) + + print("Download completed.") + + +# XXX: maybe we should add a decorator adding func to registry to keep everything in one place +DOWNLOAD_FUNCTIONS = {"electricity": download_electricity, "traffic": download_traffic} + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Data download configs") + parser.add_argument( + "--dataset", metavar="DATASET", type=str, choices=DOWNLOAD_FUNCTIONS.keys(), required=True, help="Dataset name" + ) + parser.add_argument("--output_dir", metavar="DIR", type=str, default=".", help="Path to folder for data download") + + args = parser.parse_args() + main(args) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/distributed_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/distributed_utils.py new file mode 100755 index 00000000..4d145cc2 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/distributed_utils.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +import logging +import os +import random + +import numpy as np +import torch +import torch.distributed as dist + + +def load_checkpoint(load_ckpt_path): + if load_ckpt_path: + checkpoint = torch.load() + else: + checkpoint = None + return checkpoint + + +def get_device(local_rank, device_name): + if torch.cuda.is_available() and device_name == "cuda": + torch.cuda.set_device(local_rank % torch.cuda.device_count()) + device = torch.device("cuda") + else: + device = torch.device("cpu") + return device + + +def seed_everything(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def generate_seeds(rng, size): + """ + Generate list of random seeds + + :param rng: random number generator + :param size: length of the returned list + """ + seeds = [rng.randint(0, 2 ** 32 - 1) for _ in range(size)] + return seeds + + +def broadcast_seeds(seeds, device): + """ + Broadcasts random seeds to all distributed workers. + Returns list of random seeds (broadcasted from workers with rank 0). + + :param seeds: list of seeds (integers) + :param device: torch.device + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + seeds_tensor = torch.LongTensor(seeds).to(device) + torch.distributed.broadcast(seeds_tensor, 0) + seeds = seeds_tensor.tolist() + return seeds + + +def setup_seeds(master_seed, epochs, device): + """ + Generates seeds from one master_seed. + Function returns (worker_seeds, shuffling_seeds), worker_seeds are later + used to initialize per-worker random number generators (mostly for + dropouts), shuffling_seeds are for RNGs resposible for reshuffling the + dataset before each epoch. + Seeds are generated on worker with rank 0 and broadcasted to all other + workers. 
+ + :param master_seed: master RNG seed used to initialize other generators + :param epochs: number of epochs + :param device: torch.device (used for distributed.broadcast) + """ + if master_seed == -1: + # random master seed, random.SystemRandom() uses /dev/urandom on Unix + master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1) + if get_rank() == 0: + # master seed is reported only from rank=0 worker, it's to avoid + # confusion, seeds from rank=0 are later broadcasted to other + # workers + print(f"Using random master seed: {master_seed}") + else: + # master seed was specified from command line + print(f"Using master seed from command line: {master_seed}") + + # initialize seeding RNG + seeding_rng = random.Random(master_seed) + + # generate worker seeds, one seed for every distributed worker + worker_seeds = generate_seeds(seeding_rng, get_world_size()) + + # generate seeds for data shuffling, one seed for every epoch + shuffling_seeds = generate_seeds(seeding_rng, epochs) + + # broadcast seeds from rank=0 to other workers + worker_seeds = broadcast_seeds(worker_seeds, device) + shuffling_seeds = broadcast_seeds(shuffling_seeds, device) + return worker_seeds, shuffling_seeds + + +def get_world_size(): + return int(os.environ.get("WORLD_SIZE", 1)) + + +def reduce_tensor(tensor, num_gpus, average=False): + if num_gpus > 1: + rt = tensor.clone() + dist.all_reduce(rt, op=dist.reduce_op.SUM) + if average: + if rt.is_floating_point(): + rt = rt / num_gpus + else: + rt = rt // num_gpus + return rt + return tensor + + +def init_distributed(world_size): + if dist.is_initialized(): + return True + distributed = world_size > 1 + if distributed: + backend = "nccl" if torch.cuda.is_available() else "gloo" + dist.init_process_group(backend=backend, init_method="env://") + assert dist.is_initialized() + + if get_rank() == 0: + print("Distributed initialized. World size:", world_size) + return distributed + + +def get_rank(): + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank + + +def is_main_process(): + return get_rank() == 0 + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +# XXX: Why do we even have 2 separate logging objects? 
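A minimal, hypothetical sketch of how the helpers in this module could be wired together at start-up; the epoch count and the master_seed value are placeholders rather than values taken from the platform's configs.

world_size = get_world_size()
is_distributed = init_distributed(world_size)
device = get_device(int(os.environ.get("LOCAL_RANK", 0)), "cuda")
# setup_seeds broadcasts the per-worker and per-epoch seeds from rank 0, so every worker
# ends up with the same lists and can pick its own entry.
worker_seeds, shuffling_seeds = setup_seeds(master_seed=-1, epochs=10, device=device)
seed_everything(worker_seeds[get_rank()])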
+def log(to_log): + if is_main_process(): + logging.info(to_log) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/evaluators/evaluation_metrics.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/evaluators/evaluation_metrics.py new file mode 100755 index 00000000..ce52daf3 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/evaluators/evaluation_metrics.py @@ -0,0 +1,276 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import pickle +from abc import ABC, abstractmethod + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score + + +class AbstractMetric(ABC): + @staticmethod + @abstractmethod + def __call__(label, pred, weights): + pass + + +class SMAPE(AbstractMetric): + name = "SMAPE" + + @staticmethod + def __call__(labels, preds, weights): + if weights.shape == (0, 0): + weights = None + return 100 * np.average(2 * np.abs(preds - labels) / (np.abs(labels) + np.abs(preds)), weights=weights) + + +def numpy_normalised_quantile_loss(y, y_pred, quantile): + prediction_underflow = y - y_pred + weighted_errors = quantile * np.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * np.maximum( + -prediction_underflow, 0.0 + ) + loss = weighted_errors.mean() + normaliser = abs(y).mean() + return 2 * loss / normaliser + + +class P50_loss(AbstractMetric): + name = "P50" + selector = 1 + + @staticmethod + def __call__(labels, preds, weights): + if weights.shape != (0, 0): + raise ValueError("Weights not currently supported for quantile metrics") + return numpy_normalised_quantile_loss(labels, preds, 0.5) + + +class P90_loss(AbstractMetric): + name = "P90" + selector = 2 + + @staticmethod + def __call__(labels, preds, weights): + if weights.shape != (0, 0): + raise ValueError("Weights not currently supported for quantile metrics") + return numpy_normalised_quantile_loss(labels, preds, 0.9) + + +# Normalized Deviation +class ND(AbstractMetric): + name = "ND" + + @staticmethod + def __call__(labels, preds, weights, return_individual=False): + if weights.shape == (0, 0): + if return_individual: + return np.abs(labels - preds) / np.abs(labels) + return np.sum(np.abs(labels - preds)) / np.sum(np.abs(labels)) + else: + + values = np.abs(labels - weights) + if return_individual: + return values * weights / np.sum(np.abs(labels)) + return np.sum(values * weights) / np.sum(np.abs(labels) * weights) + + +class MAE(AbstractMetric): + name = "MAE" + + @staticmethod + def __call__(labels, preds, weights, return_individual=False): + if weights.shape == (0, 0): + if return_individual: + return mean_absolute_error(preds, labels, multioutput="raw_values") + return mean_absolute_error(labels, preds) + else: + + values = mean_absolute_error(preds, labels, multioutput="raw_values") + if return_individual: + return values * weights + return np.sum(values * weights) / np.sum(weights) + + +class MSE(AbstractMetric): + name = "MSE" + + @staticmethod + def __call__(labels, preds, weights, return_individual=False): + if weights.shape == (0, 0): + if return_individual: + return mean_squared_error(preds, labels, multioutput="raw_values") + return mean_squared_error(labels, preds) + else: + + values = mean_squared_error(preds, labels, multioutput="raw_values") + if return_individual: + return values * weights + return np.sum(values * weights) / np.sum(weights) + + +class RMSE(AbstractMetric): + name = "RMSE" + + @staticmethod + def __call__(labels, preds, weights): + + if weights.shape == (0, 0): + return 
np.sqrt(mean_squared_error(labels, preds)) + else: + + values = mean_squared_error(preds, labels, multioutput="raw_values") + return np.sqrt(np.sum(values * weights) / np.sum(weights)) + + +class R_Squared(AbstractMetric): + name = "R_Squared" + + @staticmethod + def __call__(labels, preds, weights, return_individual=False): + if weights.shape == (0, 0): + if return_individual: + return r2_score(preds, labels, multioutput="raw_values") + return r2_score(preds, labels) + else: + values = r2_score(preds, labels, multioutput="raw_values") + if return_individual: + return values * weights + return np.sum(values * weights) / np.sum(weights) + + +class WMSMAPE(AbstractMetric): + name = "WMSMAPE" + + @staticmethod + def __call__(labels, preds, weights, return_individual=False): + if weights.shape != (0, 0): + if return_individual: + return 2 * weights * np.abs(preds - labels) / (np.maximum(labels, 1) + np.abs(preds)) + else: + return ( + 100.0 + / np.sum(weights) + * np.sum(2 * weights * np.abs(preds - labels) / (np.maximum(labels, 1) + np.abs(preds))) + ) + if return_individual: + return 2 * np.abs(preds - labels) / (np.maximum(labels, 1) + np.abs(preds)) + else: + return 100.0 / len(labels) * np.sum(2 * np.abs(preds - labels) / (np.maximum(labels, 1) + np.abs(preds))) + + +mapping = { + "SMAPE": SMAPE, + "WMSMAPE": WMSMAPE, + "MSE": MSE, + "MAE": MAE, + "P50": P50_loss, + "P90": P90_loss, + "RMSE": RMSE, + "R_Squared": R_Squared, + "ND": ND, +} + + +class MetricEvaluator: + def __init__(self, config): + self.output_selector = config.evaluator.get("output_selector", None) + self.label_selector = config.evaluator.get("label_selector", None) + self.metrics = [] + self.visualize_path = config.evaluator.get("visualize_path", None) + self.visualize_num = config.evaluator.get("visualize_num", 0) + + self.scalers = pickle.load(open(os.path.join(config.dataset.dest_path, "composite_scaler.bin"), "rb")) + + for metric in config.evaluator.metrics: + if metric not in mapping.keys(): + raise ValueError("No metric of name: {metric}".format(metric=metric)) + self.metrics.append(mapping[metric]()) + self.precision = config.evaluator.precision + self.config = config + + def __call__(self, labels, preds, weights=np.zeros((0, 0)), ids=np.zeros((0, 0))): + results = {} + + if len(weights.shape) > 2: + weights = np.squeeze(weights, axis=2) + for metric in [metric for metric in self.metrics if metric.name in ["P50", "P90"]]: + q_preds, q_labels = self.select(preds, labels, metric.selector, None) + + q_preds = self.scalers.inverse_transform_targets(q_preds.copy(), ids) + q_labels = self.scalers.inverse_transform_targets(q_labels.copy(), ids) + results[metric.name] = ( + np.round(metric(q_labels, q_preds, weights), self.precision) if np.all(np.isfinite(q_preds)) else np.NaN + ) + + preds, labels = self.select(preds, labels, self.output_selector, self.label_selector) + preds = self.scalers.inverse_transform_targets(preds, ids) + if self.config.evaluator.get("test_prediction_path", None): + np.savetxt(self.config.evaluator.get("test_prediction_path"), preds, delimiter=",") + labels = self.scalers.inverse_transform_targets(labels, ids) + + # naively, we are going to assume that all calls to visualize will be conducted on horizon 1 prediction for now + # this is a stopgap, and will take some work to generalize to horizon=N + + if self.visualize_num: + individual_losses = self.metrics[0](labels, preds, weights, True) + loss_df = ( + pd.DataFrame({"id": ids[:, 0], "loss": individual_losses[:, 0]}, 
index=range(len(individual_losses))) + .groupby("id") + .agg("sum") + .sort_values("loss") + .reset_index() + ) + min_ids = loss_df["id"].loc[: self.visualize_num] + min_losses = loss_df["loss"].loc[: self.visualize_num] + max_ids = loss_df["id"].loc[len(loss_df) - self.visualize_num - 1 :] + max_losses = loss_df["loss"].loc[len(loss_df) - self.visualize_num - 1 :] + group_ids = max_ids.append(min_ids) + group_losses = max_losses.append(min_losses) + idx = 0 + for group_id in group_ids: + plt.figure(idx) + rows = ids == group_id + + id_preds = preds[rows] + id_labels = labels[rows] + plt.plot(id_preds, label="Predicted") + plt.plot(id_labels, label="Actual") + plt.legend(loc="best") + plt.title("ID: {} Loss: {}".format(group_id, group_losses.iloc[idx])) + plt.savefig(self.visualize_path + "{}.png".format(group_id)) + idx += 1 + + for metric in [metric for metric in self.metrics if metric.name not in ["P50", "P90"]]: + results[metric.name] = ( + np.round(metric(labels, preds, weights), self.precision) if np.all(np.isfinite(preds)) else np.NaN + ) + targets = self.config.evaluator.get("targets", None) + if targets is not None: + missed_targets = {} + for target in targets: + if target.objective == "MIN": + if target.value < results[target.name]: + missed_targets[target.name] = results[target.name] + if target.objective == "MAX": + if target.value > results[target.name]: + missed_targets[target.name] = results[target.name] + if len(missed_targets) > 0: + raise ValueError("Target metrics not achieved: %s" % str(missed_targets)) + + return results + + def select(self, preds, labels, output_selector=None, label_selector=None): + if len(preds.shape) > 2: + if output_selector is not None: + preds = preds[:, :, output_selector] + else: + preds = np.squeeze(preds, axis=2) + if len(labels.shape) > 2: + if label_selector is not None: + labels = labels[:, :, label_selector] + else: + labels = np.squeeze(labels, axis=2) + return preds, labels diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/hp_search.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/hp_search.py new file mode 100644 index 00000000..c3f1ddf9 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/hp_search.py @@ -0,0 +1,143 @@ +import argparse +import copy +import logging +import multiprocessing +import os +import sys +import warnings +from contextlib import redirect_stderr, redirect_stdout +from datetime import datetime + +import hydra +import omegaconf +import optuna +import torch +import torch.distributed as dist +from omegaconf import OmegaConf +from optuna.samplers import TPESampler + +from distributed_utils import ( + get_device, + get_rank, + init_distributed, + is_main_process, + log, +) +from training.trainer import CTLTrainer + +warnings.filterwarnings("ignore") +logging.basicConfig(level=logging.INFO) + + +def sample_param(param, trial, name=""): + """ Sample parameters for trial """ + if param.sampling in ["categorical", "discrete"]: + + return trial.suggest_categorical(name, param.get("values")) + if param.sampling == "int_uniform": + step = param.step_value if (hasattr(param, "step_value") and param.step_value is not None) else 1 + return trial.suggest_int(name, param.min_value, param.max_value, step=step) + if param.sampling == "float_uniform": + return trial.suggest_uniform(name, param.min_value, param.max_value) + if param.sampling == "log_uniform": + return trial.suggest_loguniform(name, param.min_value, param.max_value) + if param.sampling == "discrete_uniform": + return trial.suggest_discrete_uniform(name, 
param.min_value, param.max_value, param.step_value) + + raise ValueError(f"Unknown sampling for param: {param.sampling}") + + +def traverse_conf(node, trial, name="Root"): + if isinstance(node, (omegaconf.dictconfig.DictConfig, dict)): + if node.get("sampling", None): + return sample_param(node, trial, name=name) + else: + to_change = [] + for key, value in node.items(): + new_value = traverse_conf(value, trial, name=key) + if new_value is not None: + to_change.append((key, new_value)) + for key, value in to_change: + node[key] = value + + +def launch_trial(cfg): + if not cfg.config.get("log_path", None): + cfg.config.log_path = datetime.now().strftime("./outputs/%Y-%m-%d/%H-%M-%S-%f/") + os.makedirs(os.path.join(cfg.config.get("log_path"), ".hydra"), exist_ok=True) + with open(os.path.join(cfg.config.get("log_path"), ".hydra", "config.yaml"), "w") as f: + OmegaConf.save(config=cfg, f=f) + cfg._target_ = cfg.config.dataset._target_ + train, valid, test = hydra.utils.call(cfg) + cfg._target_ = cfg.config.model._target_ + model = hydra.utils.instantiate(cfg) + model = model.cuda() + cfg._target_ = cfg.config.optimizer._target_ + optimizer = hydra.utils.instantiate(cfg, params=model.parameters()) + cfg._target_ = cfg.config.criterion._target_ + criterion = hydra.utils.call(cfg) + cfg._target_ = cfg.config.evaluator._target_ + evaluator = hydra.utils.instantiate(cfg) + + trainer = CTLTrainer(model, train, valid, test, optimizer, evaluator, criterion, cfg.config) + trainer.train() + if is_main_process(): + result = trainer.evaluate() + log(result) + return result[cfg.config.optuna.get("goal_metric", "MAE")] + + +def main(args): + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend="nccl", init_method="env://") + torch.cuda.synchronize() + + with open(args.config_path, "rb") as f: + cfg = OmegaConf.load(f) + # Launching hp search with varied world size is problematic. + if not isinstance(cfg.config.device.get("world_size", 1), int): + print("HP search currently does not support varied world sizes. Setting world size = 1", file=sys.stderr) + cfg.config.device.world_size = 1 + + # BUG: Instantiating workers in second run of hp search causes cuda reinitialization which hits the same cuda context. + if cfg.config.trainer.num_workers != 0: + print("HP search currently does not support dataloading in subprocesses. 
Setting num_workers = 0", file=sys.stderr) + cfg.config.trainer.num_workers = 0 + + study = optuna.load_study(study_name=args.study_name, storage="sqlite:////workspace/{}.db".format(args.study_name)) + + def objective(trial, cfg=cfg): + if cfg.config.device.get("world_size", 1) > 1: + trial = optuna.integration.TorchDistributedTrial(trial, device="cuda") + cfg = copy.deepcopy(cfg) + traverse_conf(cfg.config, trial) + with open(os.devnull, "w") as devnull, redirect_stdout(devnull), redirect_stderr(devnull): + result = launch_trial(cfg) + return result + + if is_main_process(): + study.optimize(objective, n_trials=cfg.config.optuna.get("n_trials", 10), n_jobs=1) + else: + for _ in range(cfg.config.optuna.get("n_trials", 10)): + objective(None) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--distributed_world_size", + type=int, + metavar="N", + default=torch.cuda.device_count(), + help="total number of GPUs across all nodes (default: all visible GPUs)", + ) + parser.add_argument( + "--distributed_rank", default=os.getenv("LOCAL_RANK", 0), type=int, help="rank of the current worker" + ) + parser.add_argument("--local_rank", default=0, type=int, help="rank of the current worker") + parser.add_argument("--config_path", required=True, type=str) + parser.add_argument("--study_name", required=True, type=str) + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deploy.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deploy.sh new file mode 100755 index 00000000..4e0ca182 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deploy.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# export TRITON_MODEL_OVERWRITE=True +NAV_DIR=$1 +NV_VISIBLE_DEVICES=$2 + +echo "Start" +# Create common bridge for client and server +BRIDGE_NAME="bridge" +# docker network create ${BRIDGE_NAME} + +# Clean up +# cleanup() { +# docker kill trt_server_cont +# docker network rm ${BRIDGE_NAME} +# } +# trap cleanup EXIT +# trap cleanup SIGTERM + +# Start Server +echo Starting server... +SERVER_ID=$(bash inference/launch_triton_server.sh ${BRIDGE_NAME} ${NAV_DIR} $NV_VISIBLE_DEVICES ) +echo $SERVER_ID +# SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} ) + + + +SERVER_URI="localhost" + +echo "Waiting for TRITON Server to be ready at http://$SERVER_URI:8000..." + +live_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/v2/health/live" +ready_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/v2/health/ready" + +current_status=$($live_command) +echo $current_status +tempvar=0 +# First check the current status. If that passes, check the json. If either fail, loop +while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do + printf "." 
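+    # Poll once per second; the counter below aborts with an error after roughly 30 failed attempts.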
+ sleep 1 + current_status=$($live_command) + if [[ $tempvar -ge 30 ]]; then + echo "Timeout waiting for triton server" + exit 1 + break + fi + tempvar=$tempvar+1 +done + +echo "TRITON Server is ready!" + + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deployer.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deployer.py new file mode 100644 index 00000000..8f71c262 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/deployer.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess +from typing import Dict, List, Optional, Tuple + +import yaml +from omegaconf import OmegaConf + +from conf.conf_utils import append_derived_config_fields + + +def run_deployment(config): + inference_dir = os.getcwd() + cfg = config + with open(os.path.join(cfg.evaluator.checkpoint, ".hydra/config.yaml"), "rb") as f: + config = OmegaConf.load(f) + append_derived_config_fields(config) + config.config.evaluator = OmegaConf.merge(config.config.evaluator, cfg.evaluator) + if cfg.inference.get("dataset_dir", None): + config.config.dataset.dest_path = cfg.inference.dataset_dir + with open(os.path.join(cfg.evaluator.checkpoint, ".hydra/config_merged.yaml"), "wb") as f: + OmegaConf.save(config=config, f=f.name) + model_name = config.config.model._target_.split(".")[1] + precision = cfg.inference.precision + assert precision in ["fp16", "fp32"], "Precision needs to be either fp32 or fp16" + # export model + output_path = os.path.join(cfg.evaluator.checkpoint, "deployment") + os.makedirs(output_path, exist_ok=True) + tspp_main_dir = os.path.sep + os.path.join(*(os.getcwd().split(os.path.sep)[:-3])) + model_format = "torchscript" if cfg.inference.export.type != "onnx" else cfg.inference.export.type + + if not cfg.inference.skip_conversion and not cfg.inference.just_deploy: + subprocess.run( + [ + "python", + "triton/export_model.py", + "--input-path", + "triton/model.py", + "--input-type", + "pyt", + "--output-path", + "{}/exported_model.pt".format(output_path), + "--output-type", + "{}".format(cfg.inference.export.type), + "--dataloader", + "triton/dataloader.py", + "--batch-size", + "{}".format(cfg.inference.batch_size), + "--model-dir", + "{}".format(cfg.evaluator.checkpoint), + "--onnx-opset", + "13", + "--ignore-unknown-parameters", + ], + cwd=tspp_main_dir, + check=True, + ) + # model-navigator run + if cfg.inference.optimize: + if cfg.inference.convert.type == "torchscript": + with open(output_path + "/exported_model.pt.yaml", "r") as stream: + var_config = yaml.safe_load(stream) + var_config_list = [] + for arg in ["--value-ranges", "--opt-shapes", "--dtypes"]: + var_config_list.append(arg) + if arg == "--value-ranges": + for k, v in var_config["inputs"].items(): + var_config_list.append(k + "=0,0") + elif arg == "--opt-shapes": + for k, v in var_config["inputs"].items(): + var_config_list.append(k + "=" + ",".join([str(x) for x in v["shape"]])) + else: + for k, v in var_config["inputs"].items(): + var_config_list.append(k + "=" + v["dtype"]) + else: + var_config_list = [] + subprocess.run( + [ + "model-navigator", + "run", + "--model-name", + model_name, + "--model-path", + "{}/exported_model.pt".format(output_path), + "--config-path", + "{}/exported_model.pt.yaml".format(output_path), + "--override-workspace", + "--workspace-path", + "{}/navigator_workspace".format(output_path), + "--verbose", + "--target-formats", + "{}".format(cfg.inference.convert.type), + "--model-format", + model_format, + "--triton-launch-mode", + 
"docker", + "--max-workspace-size", + "10000000000", + "--perf-measurement-request-count", + "100", + "--perf-analyzer-timeout", + "20", + "--concurrency", + "1", + "32", + "1024", + "--max-batch-size", + "{}".format(cfg.inference.batch_size), + "--gpus", + "all", + "--atol", + "1e-3", + "--rtol", + "100", + "--onnx-opsets", + "13", + "--container-version", + "21.09", + ] + + var_config_list, + cwd=tspp_main_dir, + check=True, + ) + else: + subprocess.run( + [ + "model-navigator", + "convert", + "--model-name", + model_name, + "--model-path", + "{}/exported_model.pt".format(output_path), + "--override-workspace", + "--workspace-path", + "{}/navigator_workspace".format(output_path), + "--output-path", + "{}/converted_model".format(output_path), + "--verbose", + "--target-formats", + "{}".format(cfg.inference.convert.type), + "--model-format", + model_format, + "--launch-mode", + "local", + "--max-workspace-size", + "10000000000", + "--max-batch-size", + "{}".format(cfg.inference.batch_size), + "--target-precisions", + precision, + "--gpus", + "all", + "--atol", + "1e-3", + "--rtol", + "100", + "--onnx-opsets", + "13", + "--container-version", + "21.09", + ], + cwd=tspp_main_dir, + check=True, + ) + subprocess.run( + [ + "model-navigator", + "triton-config-model", + "--model-name", + model_name, + "--model-path", + "{}/converted_model".format(output_path), + "--model-version", + "1", + "--model-format", + "{}".format(cfg.inference.convert.type), + "--model-repository", + "{}/navigator_workspace/model-store/".format(output_path), + "--backend-accelerator", + cfg.inference.accelerator, + "--max-batch-size", + "{}".format(cfg.inference.batch_size), + "--engine-count-per-device", + "gpu=2", + "--tensorrt-precision", + precision, + "--tensorrt-capture-cuda-graph", + "--verbose", + ], + cwd=tspp_main_dir, + check=True, + ) + convert_type = ( + cfg.inference.convert.type if cfg.inference.convert.type != "torchscript" else cfg.inference.export.type + ) + subprocess.run( + [ + "python", + "triton/check_accuracy.py", + "--native-model", + cfg.evaluator.checkpoint, + "--native-type", + "pyt", + "--export-model", + "{}/exported_model.pt".format(output_path), + "--export-type", + cfg.inference.export.type, + "--convert-model", + "{}/converted_model".format(output_path), + "--convert-type", + convert_type, + "--dataloader", + "triton/dataloader.py", + "--batch-size", + "{}".format(1), + "--model-dir", + "{}".format(cfg.evaluator.checkpoint), + ], + cwd=tspp_main_dir, + check=True, + ) + + # get the actual model name + if not os.path.isdir(os.path.join(output_path, "navigator_workspace")) or not os.path.isdir( + os.path.join(output_path, "navigator_workspace/model-store") + ): + assert ( + False + ), "This checkpoint directory is not configured correctly, there should be a dir/deployment/navigator_workspace/model-store/ directory" + files_in_store = list(os.listdir(os.path.join(output_path, "navigator_workspace/model-store"))) + if len(files_in_store) < 1: + assert False, "There needs to be exactly 1 model in the model-store directory" + model_name = cfg.inference.get("model_name") if cfg.inference.get("model_name", None) else files_in_store[0] + # deploy + subprocess.run(["bash", "inference/deploy.sh", output_path, str(cfg.inference.gpu)], cwd=tspp_main_dir, check=True) + # #create DL logger for this round of metrics + # #run inference + if not cfg.inference.just_deploy: + os.makedirs(os.path.join(inference_dir, "raw")) + dump_dir = os.path.join(inference_dir, "raw") + dump_array = ["--dump-labels"] + if 
config.config.evaluator.use_weights: + dump_array.append("--dump-inputs") + subprocess.run( + [ + "python", + "triton/run_inference_on_triton.py", + "--model-name", + model_name, + "--model-version", + "1", + "--dataloader", + "triton/dataloader.py", + "--output-dir", + "{}".format(dump_dir), + "--batch-size", + "{}".format(cfg.inference.batch_size), + "--model-dir", + "{}".format(cfg.evaluator.checkpoint), + ] + + dump_array, + cwd=tspp_main_dir, + check=True, + ) + + # calculate metrics + subprocess.run( + [ + "python", + "triton/calculate_metrics.py", + "--metrics", + "triton/metrics.py", + "--model-dir", + "{}".format(cfg.evaluator.checkpoint), + "--dump-dir", + "{}".format(dump_dir), + "--csv", + "{}".format(os.path.join(inference_dir, "metrics.csv")), + ], + cwd=tspp_main_dir, + check=True, + ) + + subprocess.run(["bash", "inference/stop_docker.sh"], cwd=tspp_main_dir) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/inference.py new file mode 100755 index 00000000..e45dfec8 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/inference.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
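The run_inference entry point below reloads the Hydra config stored next to the checkpoint and, for checkpoints produced by multi-GPU training, strips the "module." prefix that DistributedDataParallel adds to parameter names. A small self-contained sketch of that renaming on a toy state dict (not the platform's actual model):

# Toy example: keys as they appear in a checkpoint saved from a DDP-wrapped model.
state_dict = {"module.embedding.weight": 0, "module.output.bias": 1}
state_dict = {
    (k[len("module."):] if k.startswith("module.") else k): v
    for k, v in state_dict.items()
}
assert set(state_dict) == {"embedding.weight", "output.bias"}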
+ +import logging +import os +from functools import partial +from typing import Dict, List, Optional, Tuple + +import dgl +import dllogger +import hydra +import numpy as np +import torch +from apex import amp +from omegaconf import OmegaConf +from torch.utils.data import DataLoader +from torch.utils.data.dataloader import default_collate + +from conf.conf_utils import append_derived_config_fields +from loggers.log_helper import setup_logger +from training.utils import to_device + + +def run_inference(config): + cfg = config + state_dict = torch.load(os.path.join(cfg.evaluator.checkpoint, "best_checkpoint.pth.tar"))["model_state_dict"] + device = torch.device(cfg.device.get("name", "cpu")) # maybe change depending on evaluator + with open(os.path.join(cfg.evaluator.checkpoint, ".hydra/config.yaml"), "rb") as f: + config = OmegaConf.load(f) + append_derived_config_fields(config) + + if config.config.device.get("world_size", 1) > 1: + model_params = list(state_dict.items()) + for k, v in model_params: + if k[:7] == "module.": + state_dict[k[7:]] = v + del state_dict[k] + config.config.evaluator = OmegaConf.merge(config.config.evaluator, cfg.evaluator) + if cfg.inference.get("dataset_dir", None): + config.config.dataset.dest_path = cfg.inference.dataset_dir + config._target_ = config.config.evaluator._target_ + evaluator = hydra.utils.instantiate(config) + config._target_ = config.config.model._target_ + config.config.device = cfg.device + model = hydra.utils.instantiate(config) + test_method_name = config.config.model.get("test_method", "__call__") + test_method = getattr(model, test_method_name) + model.load_state_dict(state_dict) + model.eval() + model.to(device=device) + precision = cfg.inference.precision + assert precision in ["fp16", "fp32"], "Precision needs to be either fp32 or fp16" + if precision == "fp16": + model = amp.initialize(model, opt_level="O2") + if os.path.isdir(config.config.dataset.dest_path): + config._target_ = config.config.dataset._target_ + train, valid, test = hydra.utils.call(config) + del train + del valid + else: + raise ValueError("dataset_dir must be a directory") + preds_full = [] + labels_full = [] + weights_full = [] + ids_full = [] + test_target = "target_masked" if config.config.model.get("test_target_mask", True) else "target" + preds_test_output_selector = config.config.model.get("preds_test_output_selector", -1) + if config.config.dataset.get("graph", False) and config.config.model.get("graph_eligible", False): + + def _collate_graph(samples, target): + batch = dgl.batch(samples) + labels = batch.ndata["target"] + # XXX: we need discuss how to do this neatly + if target == "target_masked": + labels = labels[:, config.config.dataset.encoder_length :, :] + + return batch, labels + + _collate = _collate_graph + else: + + def _collate_dict(samples, target): + batch = default_collate(samples) + labels = batch["target"] + if target == "target_masked": + labels = labels[:, config.config.dataset.encoder_length :, :] + return batch, labels + + _collate = _collate_dict + data_loader = DataLoader( + test, + batch_size=int(cfg.inference.batch_size), + num_workers=2, + pin_memory=True, + collate_fn=partial(_collate, target=test_target), + ) + with torch.no_grad(): + for i, (batch, labels) in enumerate(data_loader): + + batch = to_device(batch, device=device) + labels = to_device(labels, device=device) + + if cfg.evaluator.get("use_weights", False): + weights = batch["weight"] + else: + weights = None + ids = batch["id"] + + labels_full.append(labels) + 
weights_full.append(weights) + preds = test_method(batch) + if preds_test_output_selector >= 0: + preds = preds[..., preds_test_output_selector : preds_test_output_selector + 1] + ids_full.append(ids) + preds_full.append(preds) + + preds_full = torch.cat(preds_full, dim=0).cpu().numpy() + labels_full = torch.cat(labels_full, dim=0).cpu().numpy() + if cfg.evaluator.get("use_weights", False): + weights_full = torch.cat(weights_full).cpu().numpy() + else: + weights_full = np.zeros((0, 0)) + ids_full = torch.cat(ids_full).cpu().numpy() + eval_metrics = evaluator(labels_full, preds_full, weights_full, ids_full) + logger = setup_logger(cfg) + logger.log(step=[], data={k: float(v) for k, v in eval_metrics.items()}, verbosity=dllogger.Verbosity.VERBOSE) + logger.log(step=[], data={"String": "Evaluation Metrics: {}".format(eval_metrics)}, verbosity=dllogger.Verbosity.DEFAULT) + return eval_metrics diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/launch_triton_server.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/launch_triton_server.sh new file mode 100755 index 00000000..3de2b6d6 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/launch_triton_server.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DOCKER_BRIDGE=${1:-"bridge"} +NAV_DIR=${2:-"/workspace/"} +NV_VISIBLE_DEVICES=${3-"0"} + +# Start TRITON server in detached state +docker run --rm -d \ + --gpus device=${NV_VISIBLE_DEVICES} \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network=${DOCKER_BRIDGE} \ + -p 8000:8000 \ + -p 8001:8001 \ + -p 8002:8002 \ + --name trt_server_cont \ + -v ${NAV_DIR}/navigator_workspace/model-store/:/models \ + nvcr.io/nvidia/tritonserver:21.09-py3 tritonserver --model-store=/models --log-verbose=1 --exit-on-error=true --strict-model-config=false diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/stop_docker.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/stop_docker.sh new file mode 100644 index 00000000..35be17f3 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/inference/stop_docker.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
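+
+# Stops the detached Triton server container started by inference/launch_triton_server.sh.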
+ +docker stop trt_server_cont \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_deployment.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_deployment.py new file mode 100644 index 00000000..8d23b0d4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_deployment.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +import hydra + +warnings.filterwarnings("ignore") + + +@hydra.main(config_path="conf/", config_name="deployment_config") +def main(cfg): + print(cfg) + cfg._target_ = cfg.config.inference._target_ + hydra.utils.call(cfg) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_inference.py new file mode 100644 index 00000000..554bd439 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_inference.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +import hydra + +warnings.filterwarnings("ignore") + + +@hydra.main(config_path="conf/", config_name="inference_config") +def main(cfg): + print(cfg) + cfg._target_ = cfg.config.inference._target_ + hydra.utils.call(cfg) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_optuna.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_optuna.py new file mode 100755 index 00000000..23b93abd --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_optuna.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +import argparse +import copy +import logging +import multiprocessing +import os +import warnings +from contextlib import contextmanager +from datetime import datetime + +import hydra +import omegaconf +import optuna +import torch +from omegaconf import OmegaConf +from optuna.samplers import TPESampler + +from distributed_utils import ( + get_device, + get_rank, + init_distributed, + is_main_process, + log, +) +from training.trainer import CTLTrainer + + +def main(args): + with open(args.config_path, "rb") as f: + cfg = OmegaConf.load(f) + + if cfg.config.optuna.get("sampler", None): + sampler = hydra.utils.instantiate(cfg.config.optuna.sampler) + else: + sampler = TPESampler(multivariate=True) + + study = optuna.create_study( + study_name=args.study_name, + sampler=sampler, + direction=cfg.config.optuna.get("direction", "minimize"), + storage="sqlite:////workspace/{}.db".format(args.study_name), # XXX we should probably save it in results directory + ) + + import subprocess + + processes = [] + world_size = cfg.config.device.get("world_size", os.environ.get("WORLD_SIZE", 1)) + for i in range(torch.cuda.device_count() // world_size): + devices = list(range(i * world_size, (i + 1) * world_size)) + command = "export CUDA_VISIBLE_DEVICES={} ; ".format(",".join([str(x) for x in devices])) + command += "python " + if world_size > 1: + command += f'-m torch.distributed.run --nproc_per_node={world_size} --master_port={1234 + i} --master_addr="127.0.0.{1+i}" ' + command += f"hp_search.py --config_path {args.config_path} --study_name {args.study_name}" + print(command) + p = subprocess.Popen(command, shell=True) + processes.append(p) + for p in processes: + p.wait() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_path", + default=None, + type=str, + required=True, + help="The path for the configuration file to run this experiement", + ) + parser.add_argument("--study_name", 
default="study_" + str(datetime.now()).replace(" ", "_"), type=str) + + args, _ = parser.parse_known_args() + main(args) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_preproc.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_preproc.py new file mode 100755 index 00000000..f2557d6d --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_preproc.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +import hydra + +warnings.filterwarnings("ignore") + + +@hydra.main(config_path="conf/", config_name="preproc_config") +def main(cfg): + print(cfg) + hydra.utils.call(cfg) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_tspp.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_tspp.py new file mode 100755 index 00000000..bbd66270 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/launch_tspp.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 + +import logging +import warnings + +import hydra +import numpy as np +import torch +from omegaconf import OmegaConf + +from conf.conf_utils import append_derived_config_fields +from data.data_utils import StatDataset +from distributed_utils import is_main_process +from training.trainer import CTLTrainer, StatTrainer + +warnings.filterwarnings("ignore") + + +def set_seed(seed): + if seed: + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + +@hydra.main(config_path="conf", config_name="config") +def main(cfg): + append_derived_config_fields(cfg) + set_seed(cfg.config.trainer.get("seed", None)) + if cfg.config.get("save_config", False): + with open(cfg.config.get("save_path", "config.yaml"), "w") as f: + OmegaConf.save(config=cfg, f=f) + return + if cfg.config.trainer.get("type", "") != "stat": + device = torch.device(cfg.config.device.get("name", "cpu")) + cfg._target_ = cfg.config.dataset._target_ + train, valid, test = hydra.utils.call(cfg) + cfg._target_ = cfg.config.model._target_ + model = hydra.utils.instantiate(cfg) + cfg._target_ = cfg.config.optimizer._target_ + optimizer = hydra.utils.instantiate(cfg, params=model.parameters()) + cfg._target_ = cfg.config.criterion._target_ + criterion = hydra.utils.call(cfg) + cfg._target_ = cfg.config.evaluator._target_ + evaluator = hydra.utils.instantiate(cfg) + trainer = CTLTrainer(model, train, valid, test, optimizer, evaluator, criterion, cfg.config) + trainer.train() + if is_main_process(): + eval_metrics = trainer.evaluate() + torch.cuda.synchronize() + del train, valid, test + else: + dataset = StatDataset( + cfg.config.dataset.features, + csv_path=cfg.config.dataset.dest_path, + encoder_length=cfg.config.dataset.encoder_length, + example_length=cfg.config.dataset.example_length, + stride=cfg.config.dataset.get("stride", 1), + split=cfg.config.dataset.test_range[0], + split_feature=cfg.config.dataset.time_ids, + ) + cfg._target_ = cfg.config.model._target_ + model = hydra.utils.instantiate(cfg) + cfg._target_ = cfg.config.evaluator._target_ + evaluator = hydra.utils.instantiate(cfg) + trainer = StatTrainer(dataset, evaluator, cfg.config, model) + eval_metrics = trainer.evaluate() + logging.info(eval_metrics) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/loggers/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/loggers/log_helper.py new file mode 100755 index 00000000..0edb956f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/loggers/log_helper.py @@ -0,0 +1,205 @@ 
+# SPDX-License-Identifier: Apache-2.0 + +import atexit +import os +import subprocess +import time +from collections import OrderedDict + +import dllogger +from dllogger import Backend, JSONStreamBackend, Logger, StdOutBackend +from torch.utils.tensorboard import SummaryWriter + +from distributed_utils import is_main_process + + +class AverageMeter: + def __init__(self): + self.reset() + + def reset(self): + self.updated = False + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, value): + self.updated = True + if isinstance(value, (tuple, list)): + val = value[0] + n = value[1] + else: + val = value + n = 1 + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def value(self): + return self.avg + + +class PerformanceMeter: + def __init__(self): + self.reset() + + def reset(self): + self.updated = False + self.start = time.time() + self.n = 0 + + def update(self, val=1): + self.updated = True + self.n += val + + @property + def value(self): + return self.n / self.elapsed_time + + @property + def elapsed_time(self): + return time.time() - self.start + + +class AggregatorBackend(Backend): + def __init__(self, verbosity, agg_dict): + super().__init__(verbosity=verbosity) + self.metrics = OrderedDict({k: v() for k, v in agg_dict.items()}) + self.metrics.flushed = True + self.step = 0 + self.epoch = 0 + self.start_time = time.time() + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def _reset_perf_meter(self, name): + for agg in self.metrics[name]: + if isinstance(agg, PerformanceMeter): + agg.reset() + + def reset_perf_meters(self): + # This method allows us to reset performance metrics in case we want to + # exclude couple first iterations from performance measurement + for name in self.metrics.keys(): + self._reset_perf_meter(name) + + def log(self, timestamp, elapsedtime, step, data): + self.step = step + if self.step == []: + self.metrics.flushed = True + if "epoch" in data.keys(): + self.epoch = data["epoch"] + for k, v in data.items(): + if k not in self.metrics.keys(): + continue + self.metrics.flushed = False + self.metrics[k].update(v) + + def flush(self): + if self.metrics.flushed: + return + result_string = "Epoch {} | step {} |".format(self.epoch, self.step) + for name, agg in self.metrics.items(): + if not agg.updated: + continue + if isinstance(agg, AverageMeter): + _name = "avg " + name + elif isinstance(agg, PerformanceMeter): + _name = name + "/s" + + result_string += _name + " {:.3f} |".format(agg.value) + agg.reset() + + result_string += "walltime {:.3f} |".format(time.time() - self.start_time) + self.metrics.flushed = True + print(result_string) + + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, "TB_summary"), flush_secs=120, max_queue=200) + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + + +def empty_step_format(step): + return "" + + +def empty_prefix_format(timestamp): + return "" + + +def no_string_metric_format(metric, metadata, value): + unit = metadata["unit"] if 
"unit" in metadata.keys() else "" + format = "{" + metadata["format"] + "}" if "format" in metadata.keys() else "{}" + if metric == "String": + return "{} {}".format(format.format(value) if value is not None else value, unit) + return "{} : {} {}".format(metric, format.format(value) if value is not None else value, unit) + + +def setup_logger(config): + log_path = config.get("log_path", os.getcwd()) + if is_main_process(): + backends = [ + TensorBoardBackend(verbosity=dllogger.Verbosity.VERBOSE, log_dir=log_path), + JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=os.path.join(log_path, "log.json")), + AggregatorBackend(verbosity=dllogger.Verbosity.VERBOSE, agg_dict={"loss": AverageMeter}), + StdOutBackend( + verbosity=dllogger.Verbosity.DEFAULT, + step_format=empty_step_format, + metric_format=no_string_metric_format, + prefix_format=empty_prefix_format, + ), + ] + + logger = Logger(backends=backends) + else: + logger = Logger(backends=[]) + container_setup_info = get_framework_env_vars() + logger.log(step="PARAMETER", data=container_setup_info, verbosity=dllogger.Verbosity.DEFAULT) + + logger.metadata("loss", {"unit": "nat", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) + logger.metadata("val_loss", {"unit": "nat", "GOAL": "MINIMIZE", "STAGE": "VAL"}) + return logger + + +def get_framework_env_vars(): + # TODO: it fails. Probably due to the fact that docker don't copy hidden directories + # process = subprocess.Popen( + # ["git", "rev-parse", "HEAD"], shell=False, stdout=subprocess.PIPE + # ) + return { + "NVIDIA_PYTORCH_VERSION": os.environ.get("NVIDIA_PYTORCH_VERSION"), + "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION"), + "CUBLAS_VERSION": os.environ.get("CUBLAS_VERSION"), + "NCCL_VERSION": os.environ.get("NCCL_VERSION"), + "CUDA_DRIVER_VERSION": os.environ.get("CUDA_DRIVER_VERSION"), + "CUDNN_VERSION": os.environ.get("CUDNN_VERSION"), + "CUDA_VERSION": os.environ.get("CUDA_VERSION"), + "NVIDIA_PIPELINE_ID": os.environ.get("NVIDIA_PIPELINE_ID"), + "NVIDIA_BUILD_ID": os.environ.get("NVIDIA_BUILD_ID"), + "NVIDIA_TF32_OVERRIDE": os.environ.get("NVIDIA_TF32_OVERRIDE"), + } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/lstm.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/lstm.py new file mode 100755 index 00000000..cb6223b1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/lstm.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.normalization.fused_layer_norm import FusedLayerNorm +from torch import Tensor + +from data.data_utils import DataTypes, InputTypes, translate_features +from models.tft_pyt.modeling import * + + +class LSTM(nn.Module): + """ + Implementation from LSTM portion of https://arxiv.org/abs/1912.09363 + """ + + def __init__(self, config): + super().__init__() + + self.encoder_steps = config.dataset.encoder_length # this determines from how distant past we want to use data from + + self.mask_nans = config.model.missing_data_strategy == "mask" + self.features = translate_features(config.dataset.features) + config = config.model + + config.temporal_known_continuous_inp_size = len( + [x for x in self.features if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS] + ) + config.temporal_observed_continuous_inp_size = len( + [ + x + for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS + ] + ) + config.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + config.static_continuous_inp_size = len( + [ + x + for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS + ] + ) + config.static_categorical_inp_lens = [ + x.get("cardinality", 100) + for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CATEGORICAL + ] + + config.temporal_known_categorical_inp_lens = [ + x.get("cardinality", 100) + for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CATEGORICAL + ] + config.temporal_observed_categorical_inp_lens = [ + x.get("cardinality", 100) + for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CATEGORICAL + ] + + config.num_static_vars = config.static_continuous_inp_size + len(config.static_categorical_inp_lens) + config.num_future_vars = config.temporal_known_continuous_inp_size + len(config.temporal_known_categorical_inp_lens) + config.num_historic_vars = sum( + [ + config.num_future_vars, + config.temporal_observed_continuous_inp_size, + config.temporal_target_size, + len(config.temporal_observed_categorical_inp_lens), + ] + ) + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + self.output_proj = nn.Linear(config.hidden_size, 1) + + def forward(self, x: Tensor) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:, : self.encoder_steps, :], t_observed_tgt[:, : self.encoder_steps, :]] + if t_observed_inp is not None: + _historical_inputs.insert(0, t_observed_inp[:, : self.encoder_steps, :]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_steps :] + + # Encoders + historical_features, _ = 
self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + + output = self.output_proj(future) + return output diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/stat_models.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/stat_models.py new file mode 100755 index 00000000..c4cd6868 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/stat_models.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +from abc import ABC + +import pmdarima as pm + + +class StatModel(ABC): + def __init__(self, config): + self.horizon = config.dataset.example_length - config.dataset.encoder_length + self.config = config + + def fit(self, endog, exog): + return + + def predict(self, exog): + return + + +class AutoARIMA(StatModel): + def __init__(self, config): + super().__init__(config) + + def fit(self, endog, exog): + self.model = pm.auto_arima(endog, X=exog) + + def predict(self, exog): + return self.model.predict(self.horizon, X=exog) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. 
Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. 
+You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. 
+This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, either by masking the missing values in the attention matrices or by embedding them as a special value in the latent space.
+This model enables the prediction of confidence intervals for the future values of a time series over multiple future timesteps.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+### Model architecture
+
+The TFT model is a hybrid architecture that joins LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to this historical data, we feed the model the historical values of the time series itself. All variables are embedded in a high-dimensional space by learning an embedding vector for each of them. Embeddings of categorical variables are learned in the classical sense of embedding discrete values, while for each continuous variable the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter the variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of the other variables and as the initial state of the LSTM encoders.
+After encoding, the variables are passed to multi-head attention layers (the decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some parts of it.
+For the sake of explainability, the heads of the self-attention layers share value matrices. This allows self-attention to be interpreted as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
+![](TFT_architecture.PNG)
+*image source: https://arxiv.org/abs/1912.09363*
+
+### Default configuration
+
+The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we additionally apply per-time-series scaling, which takes into account shifts in distribution between entities (for example, a factory consumes more electricity than an average house). The model is trained with the quantile loss computed for the quantiles [0.1, 0.5, 0.9]. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling.
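+
+For reference, a minimal sketch of the quantile (pinball) loss and of the q-risk metric reported later in this document is shown below; the function names are illustrative, and the loss actually used by the training script is defined in `criterions.py` and may differ in reduction and masking details.
+
+```python
+import torch
+
+def quantile_loss(pred, target, quantiles=(0.1, 0.5, 0.9)):
+    # pred: (..., len(quantiles)), target: (..., 1); returns one loss value per quantile
+    diff = target - pred                                    # broadcast over the quantile dimension
+    q = torch.tensor(quantiles, dtype=pred.dtype, device=pred.device)
+    ql = torch.max(q * diff, (q - 1.0) * diff)              # pinball loss
+    return ql.mean(dim=tuple(range(ql.dim() - 1)))
+
+def q_risk(pred, target, quantiles=(0.1, 0.5, 0.9)):
+    # normalized quantile loss ("q-risk") used for evaluation, one value per quantile
+    diff = target - pred
+    q = torch.tensor(quantiles, dtype=pred.dtype, device=pred.device)
+    ql = torch.max(q * diff, (q - 1.0) * diff)
+    return 2.0 * ql.sum(dim=tuple(range(ql.dim() - 1))) / target.abs().sum()
+```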
+Specific values are provided in the table below.
+
+| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
+| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
+
+### Feature support matrix
+
+The following features are supported by this model:
+
+| Feature | TFT
+|----------------------------|--------------------------
+|Distributed data parallel | Yes
+|PyTorch AMP | Yes
+
+
+#### Features
+
+[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html)
+provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
+
+[PyTorch
+DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module
+wrapper that enables easy multiprocess distributed data-parallel
+training.
+
+### Mixed precision training
+
+Mixed precision is the combined use of different numerical precisions in a
+computational method.
+[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
+computational speedup by performing operations in half-precision format while
+storing minimal information in single-precision to retain as much information
+as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with
+both the Turing and Ampere architectures, significant training speedups are
+experienced by switching to
+mixed precision -- up to 3x overall speedup on the most arithmetically intense
+model architectures. Using mixed precision training previously required two
+steps:
+
+1. Porting the model to use the FP16 data type where appropriate.
+2. Manually adding loss scaling to preserve small gradient values.
+
+The ability to train deep learning networks with lower precision was introduced
+in the Pascal architecture and first supported in [CUDA
+8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
+Learning SDK.
+
+For information about:
+* How to train using mixed precision, refer to the [Mixed Precision
+  Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
+  Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
+  documentation.
+* Techniques used for mixed precision training, refer to the [Mixed-Precision
+  Training of Deep Neural
+  Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
+  blog.
+* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in
+  PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
+
+
+#### Enabling mixed precision
+
+
+Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class.
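+
+The snippet below is a generic illustration of this mechanism (autocast combined with gradient scaling). It assumes `model`, `criterion`, `optimizer`, and `data_loader` are already defined, and it is not the exact code used by this repository, which enables mixed precision through the `--use_amp` option of the training script.
+
+```python
+from torch.cuda import amp
+
+scaler = amp.GradScaler()                      # automatic loss scaling
+
+for batch, labels in data_loader:              # assumed to exist; any PyTorch DataLoader works
+    optimizer.zero_grad()
+    with amp.autocast():                       # run the forward pass in mixed precision
+        loss = criterion(model(batch), labels)
+    scaler.scale(loss).backward()              # scale the loss to preserve small gradients
+    scaler.step(optimizer)                     # unscale gradients and update the weights
+    scaler.update()
+```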
All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. + +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. 
+
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/Forecasting/TFT
+```
+
+2. Build the TFT PyTorch NGC container.
+```bash
+docker build --network=host -t tft .
+```
+
+3. Start an interactive session in the NGC container to run training/inference.
+```bash
+docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
+```
+
+Note: Ensure that you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
+
+4. Download and preprocess the datasets.
+```bash
+bash scripts/get_data.sh
+```
+
+5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
+These scripts are tuned for DGX1-32G. If you have a different system, use the `NGPU` and `BATCH_SIZE` variables to adjust the parameters for your system.
+```bash
+bash scripts/run_electricity.sh
+bash scripts/run_traffic.sh
+```
+
+6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
+```
+
+7. Start inference/predictions. Visualize and save the predictions by running the following command.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>/test.csv \
+--cat_encodings /data/processed/<dataset>/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
+--visualize \
+--save_predictions
+```
+
+
+
+Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+## Advanced
+
+The following sections provide more details about the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+In the root directory, the most important files are:
+
+* `train.py`: Entry point for training
+* `data_utils.py`: File containing the dataset implementation and preprocessing functions
+* `modeling.py`: Definition of the model
+* `configuration.py`: Contains configuration classes for various experiments
+* `test.py`: Entry point for testing a trained model
+* `Dockerfile`: Container definition
+* `log_helper.py`: Contains helper functions for setting up dllogger
+* `criterions.py`: Definitions of loss functions
+
+The `scripts` directory contains scripts for the default use cases:
+* `run_electricity.sh`: trains the default model on the electricity dataset
+* `run_traffic.sh`: trains the default model on the traffic dataset
+
+### Command-line options
+
+To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+`python train.py --help`.
+
+The following example output is printed when using the `--help` option:
+```
+usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
+                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --data_path DATA_PATH
+  --dataset {electricity,volatility,traffic,favorita}
+  --epochs EPOCHS
+  --sample_data SAMPLE_DATA SAMPLE_DATA
+  --batch_size BATCH_SIZE
+  --lr LR
+  --seed SEED
+  --use_amp             Enable automatic mixed precision
+  --clip_grad CLIP_GRAD
+  --early_stopping EARLY_STOPPING
+                        Stop training if validation loss does not improve for more than this number of epochs.
+  --results RESULTS
+  --log_file LOG_FILE
+  --distributed_world_size N
+                        total number of GPUs across all nodes (default: all visible GPUs)
+  --distributed_rank DISTRIBUTED_RANK
+                        rank of the current worker
+  --local_rank LOCAL_RANK
+                        rank of the current worker
+  --overwrite_config OVERWRITE_CONFIG
+                        JSON string used to overload config
+
+```
+
+### Getting the data
+
+The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, automatically downloads and preprocesses the training, validation, and test datasets and produces files that contain the scalers.
+#### Dataset guidelines
+
+The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed to the common format, which contains the features we use for training. Then the standardized data is cleaned, normalized, encoded, and binarized.
+This step does the following:
+* Drop all the columns that are not marked in the configuration file as used for training or preprocessing
+* Flatten indices in case time series are indexed by more than one column
+* Split the data into training, validation, and test splits
+* Filter out all the time series shorter than the minimal example length
+* Normalize columns marked as continuous in the configuration file
+* Encode as integers columns marked as categorical
+* Save the data in CSV and binary formats
+
+#### Multi-dataset
+In order to use an alternate dataset, you have to write a function that parses your data to the common format. The format is as follows:
+* There is at least one id column
+* There is exactly one time column (that can also be used as a feature column)
+* Each feature is in a separate column
+* Each row represents a moment in time for only one time series
+
+Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
+### Training process
+
+The `train.py` script is the entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
+The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early if the validation loss has not improved for N subsequent epochs.
+The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
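+
+A rough illustration of what such a configuration captures is sketched below. The attribute names are hypothetical and the values simply mirror the electricity defaults from the [Default configuration](#default-configuration) table; refer to `configuration.py` for the real classes.
+
+```python
+# Illustrative only: attribute names are hypothetical, values follow the electricity defaults.
+class ExampleElectricityConfig:
+    encoder_length = 168           # history length fed to the encoder
+    example_length = 168 + 24      # history plus the 24-step forecast horizon
+    quantiles = [0.1, 0.5, 0.9]    # quantiles optimized by the quantile loss
+    hidden_size = 128
+    n_head = 4
+    dropout = 0.1
+```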
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
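+
+The derived columns in the table above can be reproduced from the raw throughput figures: the speedup column is the mixed-precision throughput divided by the TF32 throughput at the same GPU count, and the weak-scaling columns are the 8-GPU throughput divided by the 1-GPU throughput at the same precision. A quick, self-contained check of that arithmetic, using only the numbers from the table:
+```python
+# Throughput (items per second) copied from the table above: (1 GPU, 8 GPUs).
+tf32 = {"Electricity": (10173, 80596), "Traffic": (10197, 80692)}
+amp  = {"Electricity": (13703, 107761), "Traffic": (13779, 107979)}
+
+for name in tf32:
+    speedup_1 = amp[name][0] / tf32[name][0]   # mixed precision vs. TF32, 1 GPU
+    speedup_8 = amp[name][1] / tf32[name][1]   # mixed precision vs. TF32, 8 GPUs
+    weak_tf32 = tf32[name][1] / tf32[name][0]  # 8 GPUs vs. 1 GPU, TF32
+    weak_amp  = amp[name][1] / amp[name][0]    # 8 GPUs vs. 1 GPU, mixed precision
+    print(f"{name}: speedup {speedup_1:.2f}x / {speedup_8:.2f}x (1 / 8 GPUs), "
+          f"weak scaling {weak_tf32:.2f}x (TF32), {weak_amp:.2f}x (AMP)")
+```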
+ + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG differ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . 
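+# The repository was copied into /workspace above; the ENV instruction below appends
+# it to PYTHONPATH so the project's modules can be imported from any working directory.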
+ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. 
For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. 
+ +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. +This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, by either masking their values in attention matrices or embedding them as a special value in the latent space. +This model enables the prediction of confidence intervals for future values of time series for multiple future timesteps. + +This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +The TFT model is a hybrid architecture joining LSTM encoding of time series and interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for whole history and future), observed (known only for historical data). All these variables come in two flavors: categorical, and continuous. In addition to historical data, we feed the model with historical values of time series. All variables are embedded in high-dimensional space by learning an embedding vector. 
Categorical variables embeddings are learned in the classical sense of embedding discrete values. The model learns a single vector for each continuous variable, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for variable selection of other variables and as an initial state of LSTM encoders. +After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. Whole architecture is interwoven with residual connections with gating mechanisms that allow the architecture to adapt to various problems by skipping some parts of it. +For the sake of explainability, heads of self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is VSN activations, which tells us how relevant the given feature is to the prediction. +![](TFT_architecture.PNG) +*image source: https://arxiv.org/abs/1912.09363* + +### Default configuration + +The specific configuration of the TFT model depends on the dataset used. Not only is the volume of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For a part of the datasets, we apply scaling per-time-series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss: +For quantiles in [0.1, 0.5, 0.9]. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below. + +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Yes column +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. +[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. 
Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. + +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. 
+ +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. +```bash +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/PyTorch/Forecasting/TFT +``` + +2. Build the TFT PyTorch NGC container. +```bash +docker build --network=host -t tft . +``` + +3. Start an interactive session in the NGC container to run training/inference. +```bash +docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft +``` + +Note: Ensure to mount your dataset using the -v flag to make it available for training inside the NVIDIA Docker container. + +4. Download and preprocess datasets. +```bash +bash scripts/get_data.sh +``` + +5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. +These scripts are tuned for DGX1-32G. If you have a different system, use NGPU and BATCH_SIZE variables to adjust the parameters for your system. +```bash +bash scripts/run_electricity.sh +bash scripts/run_traffic.sh +``` + +6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy. +```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin +``` + +7. Start inference/predictions. 
Visualize and save predictions by running the following command. +```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin \ +--visualize \ +--save_predictions +``` + + + +Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance to [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. +## Advanced + +The following sections provide more details about the dataset, running training and inference, and the training results. + +### Scripts and sample code + +In the root directory, the most important files are: + +`train.py`: Entry point for training +`data_utils.py`: File containing the dataset implementation and preprocessing functions +`modeling.py`: Definition of the model +`configuration.py`: Contains configuration classes for various experiments +`test.py`: Entry point testing trained model. +`Dockerfile`: Container definition +`log_helper.py`: Contains helper functions for setting up dllogger +`criterions.py`: Definitions of loss functions + +The `scripts` directory contains scripts for default use cases: +`run_electricity.sh`: train default model on the electricity dataset +`run_traffic.sh`: train default model on the traffic dataset + +### Command-line options + +To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: +`python train.py --help`. + +The following example output is printed when running the model: +``` +usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] + [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] + +optional arguments: + -h, --help show this help message and exit + --data_path DATA_PATH + --dataset {electricity,volatility,traffic,favorita} + --epochs EPOCHS + --sample_data SAMPLE_DATA SAMPLE_DATA + --batch_size BATCH_SIZE + --lr LR + --seed SEED + --use_amp Enable automatic mixed precision + --clip_grad CLIP_GRAD + --early_stopping EARLY_STOPPING + Stop training if validation loss does not improve for more than this number of epochs. + --results RESULTS + --log_file LOG_FILE + --distributed_world_size N + total number of GPUs across all nodes (default: all visible GPUs) + --distributed_rank DISTRIBUTED_RANK + rank of the current worker + --local_rank LOCAL_RANK + rank of the current worker + --overwrite_config OVERWRITE_CONFIG + JSON string used to overload config + +``` + +### Getting the data + +The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for electricity and and traffic datasets will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers. +#### Dataset guidelines + +The `data_utils.py` file contains all functions that are used to preprocess the data. 
Initially the data is loaded to a `pandas.DataFrame` and parsed to the common format which contains the features we will use for training. Then standardized data is cleaned, normalized, encoded and binarized. +This step does the following: +Drop all the columns that are not marked in the configuration file as used for training or preprocessing +Flatten indices in case time series are indexed by more than one column +Split the data into training, validation and test splits +Filter out all the time series shorter than minimal example length +Normalize columns marked as continuous in the configuration file +Encode as integers columns marked as categorical +Save the data in csv and binary formats + +#### Multi-dataset +In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows: +There is at least one id column +There is exactly one time column (that can also be used as a feature column) +Each feature is in a separate column +Each row represents a moment in time for only one time series +Additionally, you must specify a configuration of the network, including a data description. Refer to the example in `configuration.py` file. +### Training process + +The `train.py` script is an entry point for a training procedure. Refined recipes can be found in the `scripts` directory. +The model trains for at most `--epochs` epochs. If option `--early_stopping N` is set, then training will end if for N subsequent epochs validation loss hadn’t improved. +The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. 
Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
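+
+The derived columns follow from the raw throughput figures: for Electricity on a single V100, for example, 9148 / 5580 ≈ 1.64x (FP32 to mixed-precision speedup), while the 8-GPU weak scaling is 43351 / 5580 ≈ 7.77x for FP32 and 69855 / 9148 ≈ 7.64x for mixed precision.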
+ +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG differ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
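+
+# Each class below bundles one experiment configuration: the feature specification
+# (column names with their InputTypes/DataTypes), the train/valid/test split
+# boundaries, preprocessing flags, and the model hyperparameters. train.py selects
+# a configuration through its --dataset option via the CONFIGS dict at the bottom
+# of this file.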
+ +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + 
self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. 
Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. + # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == 
InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
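+    # The per-group arrays below are cast to float32 and reinterpreted with `.view(dtype=np.int32)`
+    # so that continuous and categorical columns can be stored in a single homogeneous array;
+    # TFTBinaryDataset reverses this with `.view(dtype=np.float32)` before casting each column
+    # group back to the dtype declared in DTYPE_MAP.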
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
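+        # Derive calendar covariates (hour, day, day_of_week, month) and a continuous time index
+        # relative to the earliest timestamp of this series; these become the known inputs listed
+        # in ElectricityConfig.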
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
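+    # 'hours_from_start' is the continuous time index expected by TrafficConfig
+    # (used both as the TIME column and as a known continuous input).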
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
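+
+# Pins each training process to the CPU cores reported by the driver as local to its GPU,
+# querying the per-device affinity mask through NVML (pynvml). set_affinity() at the bottom
+# of this file selects one of several strategies: 'socket', 'single', 'single_unique',
+# 'socket_unique_interleaved' or 'socket_unique_continuous'.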
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = 
flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets 
and predictions + _g = {'targets': g[:,:,0]} + _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
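+
+# Sets up dllogger with JSON-file, stdout and TensorBoard backends (only on rank 0 when
+# running distributed) and registers metadata for the metrics reported during training,
+# validation and testing.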
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. 
Temporal known a priori continuous
+        # 5. Temporal observed categorical
+        # 6. Temporal observed continuous
+        # 7. Temporal observed targets (time series observed so far)
+
+        self.s_cat_embed = nn.ModuleList([
+            nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None
+        self.t_cat_k_embed = nn.ModuleList([
+            nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None
+        self.t_cat_o_embed = nn.ModuleList([
+            nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None
+
+        self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
+        self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
+        self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
+        self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size))
+
+        self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
+        self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
+        self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
+        self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size))
+
+        if self.s_cont_embedding_vectors is not None:
+            torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors)
+        if self.t_cont_k_embedding_vectors is not None:
+            torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors)
+        if self.t_cont_o_embedding_vectors is not None:
+            torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors)
+        torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors)
+
+    def _apply_embedding(self,
+                         cat: Optional[Tensor],
+                         cont: Optional[Tensor],
+                         cat_emb: Optional[nn.ModuleList],
+                         cont_emb: Tensor,
+                         cont_bias: Tensor,
+                         ) -> Optional[Tensor]:
+        e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None
+        if cont is not None:
+            # the line below is equivalent to the following einsums
+            # e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb)
+            # e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb)
+            e_cont = torch.mul(cont.unsqueeze(-1), cont_emb)
+            e_cont = e_cont + cont_bias
+        else:
+            e_cont = None
+
+        if e_cat is not None and e_cont is not None:
+            return torch.cat([e_cat, e_cont], dim=-2)
+        elif e_cat is not None:
+            return e_cat
+        elif e_cont is not None:
+            return e_cont
+        else:
+            return None
+
+    def forward(self, x: Dict[str, Tensor]):
+        # temporal/static categorical/continuous known/observed input
+        s_cat_inp = x.get('s_cat', None)
+        s_cont_inp = x.get('s_cont', None)
+        t_cat_k_inp = x.get('k_cat', None)
+        t_cont_k_inp = x.get('k_cont', None)
+        t_cat_o_inp = x.get('o_cat', None)
+        t_cont_o_inp = x.get('o_cont', None)
+        t_tgt_obs = x['target']  # Has to be present
+
+        # Static inputs are expected to be equal for all timesteps
+        # For memory efficiency there is no assert statement
+        s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None
+        s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None
+
+        s_inp = self._apply_embedding(s_cat_inp,
+                                      s_cont_inp,
+                                      self.s_cat_embed,
+
self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 
1)))
+        attn_score.mul_(self.scale)
+
+        if mask_future_timesteps:
+            attn_score = attn_score + self._mask
+
+        attn_prob = F.softmax(attn_score, dim=3)
+        attn_prob = self.attn_dropout(attn_prob)
+
+        # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v)
+        attn_vec = torch.matmul(attn_prob, v.unsqueeze(1))
+        m_attn_vec = torch.mean(attn_vec, dim=1)
+        out = self.out_proj(m_attn_vec)
+        out = self.out_dropout(out)
+
+        return out, attn_vec
+
+
+
+class TemporalFusionTransformer(nn.Module):
+    """
+    Implementation of https://arxiv.org/abs/1912.09363
+    """
+    def __init__(self, config):
+        super().__init__()
+
+        if hasattr(config, 'model'):
+            config = config.model
+
+        self.encoder_length = config.encoder_length  # determines how far into the past the encoder looks
+
+        self.embedding = TFTEmbedding(config)
+        self.static_encoder = StaticCovariateEncoder(config)
+
+        self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars)
+        self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
+        self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars)
+        self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
+
+
+        self.input_gate = GLU(config.hidden_size, config.hidden_size)
+        self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3)
+
+        self.enrichment_grn = GRN(config.hidden_size,
+                                  config.hidden_size,
+                                  context_hidden_size=config.hidden_size,
+                                  dropout=config.dropout)
+        self.attention = InterpretableMultiHeadAttention(config)
+        self.attention_gate = GLU(config.hidden_size, config.hidden_size)
+        self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3)
+
+        self.positionwise_grn = GRN(config.hidden_size,
+                                    config.hidden_size,
+                                    dropout=config.dropout)
+
+        self.decoder_gate = GLU(config.hidden_size, config.hidden_size)
+        self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3)
+
+        self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles))
+
+    def forward(self, x: Dict[str, Tensor]) -> Tensor:
+        s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x)
+
+        # Static context
+        cs, ce, ch, cc = self.static_encoder(s_inp)
+        ch, cc = ch.unsqueeze(0), cc.unsqueeze(0)  # LSTM initial states
+
+        # Temporal input
+        _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]]
+        if t_observed_inp is not None:
+            _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:])
+
+        historical_inputs = torch.cat(_historical_inputs, dim=-2)
+        future_inputs = t_known_inp[:, self.encoder_length:]
+
+        # Encoders
+        historical_features, _ = self.history_vsn(historical_inputs, cs)
+        history, state = self.history_encoder(historical_features, (ch, cc))
+        future_features, _ = self.future_vsn(future_inputs, cs)
+        future, _ = self.future_encoder(future_features, state)
+        torch.cuda.synchronize()  # this call gives a perf boost for unknown reasons
+
+        # skip connection
+        input_embedding = torch.cat([historical_features, future_features], dim=1)
+        temporal_features = torch.cat([history, future], dim=1)
+        temporal_features = self.input_gate(temporal_features)
+        temporal_features = temporal_features + input_embedding
+        temporal_features = self.input_gate_ln(temporal_features)
+
+        # Static enrichment
+        enriched = self.enrichment_grn(temporal_features, c=ce)
+
+        # Temporal self attention
+        x, _ = self.attention(enriched, mask_future_timesteps=True)
+
+        # Don't compute historical quantiles
+        x = x[:, self.encoder_length:, :]
+        temporal_features = 
temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. 
You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a 
state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
+This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked out in the attention matrices or embedded as a special value in the latent space.
+The model predicts confidence intervals for future values of a time series over multiple future timesteps.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+### Model architecture
+
+The TFT model is a hybrid architecture that joins LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector for each. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs according to their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as the initial state of the LSTM encoders.
+After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts.
+For the sake of explainability, the heads of the self-attention layers share value matrices. This allows self-attention to be interpreted as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
+![](TFT_architecture.PNG)
+*image source: https://arxiv.org/abs/1912.09363*
+
+### Default configuration
+
+The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we additionally apply per-time-series scaling, which accounts for shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss for the quantiles [0.1, 0.5, 0.9]; the loss and the q-risk evaluation metric are given below. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling.
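+For reference, the quantile loss used for training and the normalized quantile loss (q-risk) used for evaluation, following the [TFT paper](https://arxiv.org/abs/1912.09363), can be written as:
+```
+QL(y, \hat{y}, q)  =  q \cdot \max(y - \hat{y}, 0) + (1 - q) \cdot \max(\hat{y} - y, 0)
+
+qRisk_q  =  \frac{2 \sum_t QL(y_t, \hat{y}_t, q)}{\sum_t |y_t|}
+```
+where the sums in the q-risk run over all forecasted points of all time series in the evaluated split.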
Specific values are provided in the table below. + +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Yes column +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. +[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. 
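+As a rough illustration only (a generic PyTorch training step, not this repository's actual training loop), autocast and gradient scaling are typically combined like this:
+```python
+# Generic torch.cuda.amp sketch for illustration; model, batch, target,
+# criterion, and optimizer are placeholder names, not objects from this repository.
+import torch
+
+scaler = torch.cuda.amp.GradScaler()
+
+def train_step(model, batch, target, criterion, optimizer):
+    optimizer.zero_grad()
+    with torch.cuda.amp.autocast():
+        prediction = model(batch)           # forward pass runs in mixed precision
+        loss = criterion(prediction, target)
+    scaler.scale(loss).backward()           # scale the loss to preserve small gradients
+    scaler.step(optimizer)                  # unscale gradients and update the weights
+    scaler.update()                         # adjust the loss scale for the next step
+    return loss.item()
+```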
In PyTorch, loss scaling can be applied automatically by the GradScaler class. All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. + +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. 
For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
+
+1. Clone the repository.
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/Forecasting/TFT
+```
+
+2. Build the TFT PyTorch NGC container.
+```bash
+docker build --network=host -t tft .
+```
+
+3. Start an interactive session in the NGC container to run training/inference.
+```bash
+docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
+```
+
+Note: Ensure you mount your dataset using the -v flag to make it available for training inside the NVIDIA Docker container.
+
+4. Download and preprocess the datasets.
+```bash
+bash scripts/get_data.sh
+```
+
+5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
+These scripts are tuned for DGX1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system.
+```bash
+bash scripts/run_electricity.sh
+bash scripts/run_traffic.sh
+```
+
+6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>_bin/test.csv \
+--cat_encodings /data/processed/<dataset>_bin/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>_bin/tgt_scalers.bin
+```
+
+7. Start inference/predictions. Visualize and save predictions by running the following command.
+```bash
+python inference.py \
+--checkpoint <path to checkpoint> \
+--data /data/processed/<dataset>_bin/test.csv \
+--cat_encodings /data/processed/<dataset>_bin/cat_encodings.bin \
+--tgt_scalers /data/processed/<dataset>_bin/tgt_scalers.bin \
+--visualize \
+--save_predictions
+```
+
+
+
+Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+## Advanced
+
+The following sections provide more details about the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+In the root directory, the most important files are:
+
+* `train.py`: Entry point for training
+* `data_utils.py`: File containing the dataset implementation and preprocessing functions
+* `modeling.py`: Definition of the model
+* `configuration.py`: Contains configuration classes for various experiments
+* `test.py`: Entry point for testing a trained model
+* `Dockerfile`: Container definition
+* `log_helper.py`: Contains helper functions for setting up dllogger
+* `criterions.py`: Definitions of loss functions
+
+The `scripts` directory contains scripts for the default use cases:
+* `run_electricity.sh`: train the default model on the electricity dataset
+* `run_traffic.sh`: train the default model on the traffic dataset
+
+### Command-line options
+
+To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+`python train.py --help`.
+ +The following example output is printed when running the model: +``` +usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] + [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] + +optional arguments: + -h, --help show this help message and exit + --data_path DATA_PATH + --dataset {electricity,volatility,traffic,favorita} + --epochs EPOCHS + --sample_data SAMPLE_DATA SAMPLE_DATA + --batch_size BATCH_SIZE + --lr LR + --seed SEED + --use_amp Enable automatic mixed precision + --clip_grad CLIP_GRAD + --early_stopping EARLY_STOPPING + Stop training if validation loss does not improve for more than this number of epochs. + --results RESULTS + --log_file LOG_FILE + --distributed_world_size N + total number of GPUs across all nodes (default: all visible GPUs) + --distributed_rank DISTRIBUTED_RANK + rank of the current worker + --local_rank LOCAL_RANK + rank of the current worker + --overwrite_config OVERWRITE_CONFIG + JSON string used to overload config + +``` + +### Getting the data + +The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for electricity and and traffic datasets will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers. +#### Dataset guidelines + +The `data_utils.py` file contains all functions that are used to preprocess the data. Initially the data is loaded to a `pandas.DataFrame` and parsed to the common format which contains the features we will use for training. Then standardized data is cleaned, normalized, encoded and binarized. +This step does the following: +Drop all the columns that are not marked in the configuration file as used for training or preprocessing +Flatten indices in case time series are indexed by more than one column +Split the data into training, validation and test splits +Filter out all the time series shorter than minimal example length +Normalize columns marked as continuous in the configuration file +Encode as integers columns marked as categorical +Save the data in csv and binary formats + +#### Multi-dataset +In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows: +There is at least one id column +There is exactly one time column (that can also be used as a feature column) +Each feature is in a separate column +Each row represents a moment in time for only one time series +Additionally, you must specify a configuration of the network, including a data description. Refer to the example in `configuration.py` file. +### Training process + +The `train.py` script is an entry point for a training procedure. Refined recipes can be found in the `scripts` directory. +The model trains for at most `--epochs` epochs. If option `--early_stopping N` is set, then training will end if for N subsequent epochs validation loss hadn’t improved. +The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. 
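For reference, and in line with the Multi-dataset section above, adding support for a new dataset amounts to writing one more configuration class in `configuration.py` and registering it in the `CONFIGS` dictionary. The sketch below is hypothetical (the class name and the column names are placeholders, not part of this repository) and simply mirrors the structure of `ElectricityConfig`:

```python
from data_utils import InputTypes, DataTypes, FeatureSpec


class MyDatasetConfig():
    def __init__(self):
        # Every column of the preprocessed CSV gets a FeatureSpec describing
        # how it is used (ID / TIME / TARGET / KNOWN / OBSERVED / STATIC)
        # and how it is embedded (CONTINUOUS / CATEGORICAL).
        self.features = [
            FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL),
            FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS),
            FeatureSpec('value', InputTypes.TARGET, DataTypes.CONTINUOUS),
            FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS),
            FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL),
        ]
        # Train/valid/test boundaries, expressed in units of the time_ids column
        self.time_ids = 'days_from_start'
        self.train_range = (0, 100)
        self.valid_range = (93, 120)
        self.test_range = (113, 140)
        self.dataset_stride = 1
        self.scale_per_id = True
        self.missing_id_strategy = None
        self.missing_cat_data_strategy = 'encode_all'
        # Cardinalities of the categorical inputs and the model hyperparameters
        self.static_categorical_inp_lens = [100]  # e.g. number of distinct time series
        self.temporal_known_categorical_inp_lens = []
        self.temporal_observed_categorical_inp_lens = []
        self.quantiles = [0.1, 0.5, 0.9]
        self.example_length = 8 * 24  # total window length
        self.encoder_length = 7 * 24  # history seen by the encoder
        self.n_head = 4
        self.hidden_size = 128
        self.dropout = 0.1
        self.attn_dropout = 0.0
        # ...plus the derived *_inp_size and num_*_vars fields, computed
        # exactly as in ElectricityConfig / TrafficConfig.
```

A class like this would also have to be registered in the `CONFIGS` dictionary (for example, `CONFIGS['my_dataset'] = MyDatasetConfig`) and added to the `--dataset` choices accepted by `train.py`.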
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
+ + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG differ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
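
# This module defines one configuration class per benchmark dataset. Each
# class lists the dataset's columns as FeatureSpec entries (how every column
# is used and embedded), the train/valid/test split boundaries, the
# normalization strategy, and the TFT hyperparameters. A configuration is
# selected by name through the CONFIGS dictionary defined at the end of the
# file.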
+ +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + 
self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. 
Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. + # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == 
InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
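    # Storage format: each group below becomes one 2-D array per time series.
    # Every column is first cast to float32 and the array is then stored
    # through an int32 view (a bit-level reinterpretation, not a value
    # conversion); TFTBinaryDataset reverses this with .view(dtype=np.float32)
    # before casting each column group back to its final dtype.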
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
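        # Assemble the per-customer hourly frame together with the calendar
        # features (hour, day, day_of_week, month) and the time indices
        # (t, days_from_start) derived from the timestamp.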
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
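
# This module binds each worker process to CPU cores that are local to its
# GPU. Per-GPU CPU masks are read through NVML (pynvml); set_affinity() at the
# bottom of the file dispatches between several binding modes ('socket',
# 'single', 'single_unique', 'socket_unique_interleaved',
# 'socket_unique_continuous') and applies the chosen mask with
# os.sched_setaffinity.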
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
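The set_affinity helper above maps a mode string to one of the pinning strategies and returns the CPU set the calling process ends up bound to. A minimal sketch of the per-worker call, mirroring how train.py later in this diff uses it (the LOCAL_RANK fallback to 0 is an assumption for single-process runs):

import os
import torch
import gpu_affinity  # the module defined above

local_rank = int(os.environ.get('LOCAL_RANK', 0))   # 0 when not launched via torch.distributed
nproc_per_node = torch.cuda.device_count()

# 'socket_unique_interleaved' gives each GPU a disjoint, interleaved slice of the
# cores on its socket; plain 'socket' binds the process to the whole socket instead
affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node,
                                     mode='socket_unique_interleaved')
print(f'rank {local_rank}: thread affinity: {affinity}')
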
+ +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, 
model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. 
+ for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. + visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. 
Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}' + log_path = os.path.join(args.results, fname) + if not os.path.exists(log_path): + break + + def metric_format(metric, metadata, value): + return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) + def step_format(step): + if step == (): + return "Finished |" + elif isinstance(step, int): + return "Step {0: <5} |".format(step) + return "Step {} |".format(step) + + + if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: + dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), + TensorBoardBackend(verbosity=1, log_dir=args.results), + StdOutBackend(verbosity=2, + step_format=step_format, + prefix_format=lambda x: "")#, + #metric_format=metric_format) + ]) + else: + dllogger.init(backends=[]) + dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) + + container_setup_info = {**get_framework_env_vars(), **get_system_info()} + dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) + + dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) + dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) + dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) + dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) + dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) + dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) + dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) + + +def get_framework_env_vars(): + return { + 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), + 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + +def get_system_info(): + system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout + system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] + system_info = [x for x in system_info if x] + return {'system_info': system_info} diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. 
Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + 
self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 
1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = 
temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, 
args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, 
len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to 
the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
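Both inference.py and train.py above turn the per-quantile losses into the q-risk reported in the logs: twice the quantile loss divided by the mean absolute unscaled target. QuantileLoss itself lives in criterions.py, which is not part of this excerpt, so the sketch below assumes the standard pinball-loss definition rather than copying the project's implementation:

import torch

def pinball_loss(pred, target, q):
    # q * (y - y_hat) when under-predicting, (1 - q) * (y_hat - y) otherwise
    diff = target - pred
    return torch.max(q * diff, (q - 1.0) * diff).mean()

def q_risk(unscaled_predictions, unscaled_targets, quantiles=(0.1, 0.5, 0.9)):
    # unscaled_predictions: [N, T, len(quantiles)], unscaled_targets: [N, T, 1]
    losses = torch.stack([
        pinball_loss(unscaled_predictions[..., i], unscaled_targets[..., 0], q)
        for i, q in enumerate(quantiles)
    ])
    normalizer = unscaled_targets.abs().mean()
    return 2 * losses / normalizer   # one risk value per quantile: P10, P50, P90

# example with random data of the shape produced by predict()
preds = torch.rand(8, 24, 3)
tgts = torch.rand(8, 24, 1)
print(q_risk(preds, tgts))
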
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = 
TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = 
p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, 
verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
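The validate() routine in train.py above keeps its best loss, early-stopping counter, and convergence step as attributes on the function object itself, so the state persists between epochs without globals or a dedicated class. A minimal sketch of that pattern, with a hypothetical function and attribute name:

# Sketch of the function-attribute pattern used by validate() above.
def bump():
    if not hasattr(bump, 'count'):
        bump.count = 0
    bump.count += 1
    return bump.count

bump(); bump()
print(bump.count)  # 2 -- state survives across calls without globals or a class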
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = 
len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
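QuantileLoss in criterions.py above computes the pinball loss: for quantile q, under-prediction is penalised with weight q and over-prediction with weight 1 - q, then averaged per quantile. train.py and inference.py later normalise these losses into q-risk as 2 * losses / targets.abs().mean(). A minimal sketch of the same arithmetic with hypothetical values:

import torch
import torch.nn.functional as F

# Pinball loss for one observed value and three hypothetical quantile forecasts.
q = torch.tensor([0.1, 0.5, 0.9])
predictions = torch.tensor([[1.2, 1.5, 1.8]])   # P10 / P50 / P90 forecasts
targets = torch.tensor([[2.0]])                 # observed value
diff = predictions - targets
ql = (1 - q) * F.relu(diff) + q * F.relu(-diff)
print(ql)  # tensor([[0.0800, 0.2500, 0.1800]]) -- all three under-predict, each weighted by its q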
+ +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. 
+ # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. 
+ # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
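    # A minimal sketch (hypothetical values) of the float32 <-> int32 round trip used below:
    # every column, categorical codes included, is stored as one homogeneous array per id.
    # preprocess() writes   np.array([3, 7, 42]).astype(np.float32).view(np.int32)
    # and TFTBinaryDataset restores it with .view(dtype=np.float32) followed by a
    # per-feature .astype(), so the original values come back unchanged.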
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
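ModelEma in ema.py above keeps a shadow copy of the model and, on every update, blends each parameter and buffer towards the live values with ema <- decay * ema + (1 - decay) * value. A minimal sketch of that arithmetic for a single hypothetical scalar weight:

import torch

# EMA update rule applied by ModelEma above to every state_dict entry.
decay = 0.999                    # hypothetical decay, matching the class default
ema_v = torch.tensor([1.0])      # shadow value
model_v = torch.tensor([0.0])    # live value held constant for illustration
for _ in range(1000):
    ema_v = decay * ema_v + (1 - decay) * model_v
print(ema_v)  # roughly 0.37, i.e. 0.999**1000: the shadow drifts slowly towards the live weights

In train.py, when --ema_decay is non-zero the EMA copy is updated after each optimizer step and is the model passed to validate(), so checkpoints hold the smoothed weights rather than the raw ones.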
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = 
torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) + + if args.joint_visualization: + summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) + for q, t in _g.items(): # target and quantiles, timehorizon values + if q == 'targets': + targets = torch.cat([t[:,0], t[-1,1:]]) # WIP + # We want to plot targets on the same graph as predictions. Probably could be written better. + for i, val in enumerate(targets): + summary_writer.add_scalars(str(key), {f'{q}':val}, i) + continue + + # Tensor t contains different time horizons which are shifted in phase + # Next lines realign them + y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) + for i in range(y.shape[1]): + y[i:i+t.shape[0], i] = t[:,i] + + for i, vals in enumerate(y): # timestep, timehorizon values value + summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) + summary_writer.close() + + if args.save_predictions: + for q, t in _g.items(): + df = pd.DataFrame(t.tolist()) + df.columns = [f't+{i+1}' for i in range(len(df.columns))] + os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) + df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) + + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + q_risk = 2 * losses / normalizer + + perf_dict = { + 'throughput': perf_meter.avg, + 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), + 'latency_p90': perf_meter.p(90), + 'latency_p95': perf_meter.p(95), + 'latency_p99': perf_meter.p(99), + 'total_infernece_time': perf_meter.total_time, + } + + return q_risk, perf_dict + + +def main(args): + + setup_logger(args) + # Set up model + state_dict = torch.load(args.checkpoint) + config = state_dict['config'] + model = TemporalFusionTransformer(config).cuda() + model.load_state_dict(state_dict['model']) + model.eval() + model.cuda() + + # Set up dataset + test_split = TFTDataset(args.data, config) + data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) + + scalers = pickle.load(open(args.tgt_scalers, 'rb')) + cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) + + if args.visualize: + # TODO: abstract away all forms of visualization. 
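+ # visualize_v2 saves one PDF plot per example under <results>/single_example_vis/<series id>/;
+ # the joint TensorBoard plots and CSV dumps are produced inside inference() below.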
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
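+
+# Logging helpers shared by training and inference. setup_logger() initializes
+# dllogger with JSON, stdout, and TensorBoard backends (only on rank 0 when run
+# distributed); the TensorBoard backend expects hyperparameters to be logged with
+# step='HPARAMS' and final metrics with step=(), which is how inference.py reports
+# its final q-risk and latency numbers.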
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
+ log_path = os.path.join(args.results, fname)
+ if not os.path.exists(log_path):
+ break
+
+ def metric_format(metric, metadata, value):
+ return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
+ def step_format(step):
+ if step == ():
+ return "Finished |"
+ elif isinstance(step, int):
+ return "Step {0: <5} |".format(step)
+ return "Step {} |".format(step)
+
+
+ if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
+ dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
+ TensorBoardBackend(verbosity=1, log_dir=args.results),
+ StdOutBackend(verbosity=2,
+ step_format=step_format,
+ prefix_format=lambda x: "")#,
+ #metric_format=metric_format)
+ ])
+ else:
+ dllogger.init(backends=[])
+ dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
+
+ container_setup_info = {**get_framework_env_vars(), **get_system_info()}
+ dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
+
+ dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+ dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+ dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+ dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+ dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
+ dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+ dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+ dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+ dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+ dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
+ dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+ dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+ dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+ dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
+ dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+ dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+ dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+
+
+def get_framework_env_vars():
+ return {
+ 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
+ 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
+ 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
+ 'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
+ 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
+ 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
+ 'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
+ 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
+ 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
+ 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
+ }
+
+def get_system_info():
+ system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
+ system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
+ system_info = [x for x in system_info if x]
+ return {'system_info': system_info}
diff --git
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. Temporal observed continuous + # 7. 
Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + 
self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = attn_score + self._mask + + attn_prob = 
F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + + x = self.attention_gate(x) + x = x + 
enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
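+
+# The variables below use Bash default-value expansion, so each can be overridden
+# from the environment without editing this script, for example (illustrative
+# invocation, adjust the values to your setup):
+#   NGPU=1 BATCH_SIZE=512 EPOCHS=5 bash scripts/run_electricity_DGX1-16G.sh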
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile new file mode 100644 index 00000000..70552ea1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile @@ -0,0 +1,36 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 + +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y libb64-dev libb64-0d +WORKDIR /workspace +#ENV PYTHONPATH /workspace +RUN pip uninstall -y typing + +RUN apt update && apt install -y p7zip-full +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --ignore-installed -r requirements.txt +RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger + +COPY . . +ENV PYTHONPATH="${PYTHONPATH}:/workspace" + +# AMP monkey-patch +RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py +RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT new file mode 100644 index 00000000..5d1d88cf --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT @@ -0,0 +1,25 @@ +Individual Contributor License Agreement (CLA) +Thank you for submitting your contributions to this project. + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. + +License. +You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. + +Moral Rights. +To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +Third Party Content. +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +Representations. +You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. + +Disclaimer. +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. + +No Obligation. 
+You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE new file mode 100644 index 00000000..ae19bb47 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE @@ -0,0 +1,3 @@ +TFT for PyTorch + +This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md new file mode 100644 index 00000000..69b39d12 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md @@ -0,0 +1,465 @@ +# Temporal Fusion Transformer For PyTorch + +This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. + +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) + * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) with the collaboration with the University of Oxford. 
+This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets, either by masking missing values in the attention matrices or by embedding them as a special value in the latent space.
+The model predicts confidence intervals for future values of the time series over multiple future timesteps.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+### Model architecture
+
+The TFT model is a hybrid architecture joining the LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these inputs, the model is fed the historical values of the time series itself. All variables are embedded in a high-dimensional space by learning an embedding vector. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as the initial state of the LSTM encoders.
+After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow the architecture to adapt to various problems by skipping some of its parts.
+For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
+![](TFT_architecture.PNG)
+*image source: https://arxiv.org/abs/1912.09363*
+
+### Default configuration
+
+The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss
+$QL(y, \hat{y}, q) = q \cdot \max(y - \hat{y}, 0) + (1 - q) \cdot \max(\hat{y} - y, 0)$
+for quantiles $q \in \{0.1, 0.5, 0.9\}$. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
+ +| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | +| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Yes column +|----------------------------|-------------------------- +|Distributed data parallel | Yes +|PyTorch AMP | Yes + + +#### Features + +[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) +provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. + +[PyTorch +DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module +wrapper that enables easy multiprocess distributed data-parallel +training. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. +[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with +both the Turing and Ampere architectures, significant training speedups are +experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: +* How to train using mixed precision, refer to the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, refer to the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in + PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) + . + + +#### Enabling mixed precision + + +Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. 
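Below is a minimal, generic sketch of this autocast/GradScaler pattern. It is illustrative only: the dummy model, loss, and data are stand-ins and not the actual TFT training loop in `train.py`.
```python
import torch
from torch import nn
from torch.cuda import amp

# Stand-ins for the real model, loss, and data (illustrative only)
model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
scaler = amp.GradScaler()  # handles dynamic loss scaling

for step in range(10):
    batch = torch.randn(32, 16, device='cuda')
    target = torch.randn(32, 1, device='cuda')
    optimizer.zero_grad()
    with amp.autocast():                  # forward pass runs in mixed precision
        loss = criterion(model(batch), target)
    scaler.scale(loss).backward()         # scaled backward pass preserves small gradients
    scaler.step(optimizer)                # unscales gradients, then runs the optimizer step
    scaler.update()                       # adjusts the loss scale for the next iteration
```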
All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). + +To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. + +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + + + +### Glossary + +**Multi horizon prediction** +Process of estimating values of a time series for multiple future time steps. + +**Quantiles** +Cut points dividing the range of a probability distribution intervals with equal probabilities. + +**Time series** +Series of data points indexed and equally spaced in time. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + +## Setup + +The following section lists the requirements that you need to meet in order to start training the TFT model. + +### Requirements + +This repository contains Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) +- Supported GPUs: +- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) +- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) +- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) + + +For those unable to use the PyTorch NGC container to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. + +1. Clone the repository. 
+```bash +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/PyTorch/Forecasting/TFT +``` + +2. Build the TFT PyTorch NGC container. +```bash +docker build --network=host -t tft . +``` + +3. Start an interactive session in the NGC container to run training/inference. +```bash +docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft +``` + +Note: Ensure to mount your dataset using the -v flag to make it available for training inside the NVIDIA Docker container. + +4. Download and preprocess datasets. +```bash +bash scripts/get_data.sh +``` + +5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. +These scripts are tuned for DGX1-32G. If you have a different system, use NGPU and BATCH_SIZE variables to adjust the parameters for your system. +```bash +bash scripts/run_electricity.sh +bash scripts/run_traffic.sh +``` + +6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy. +```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin +``` + +7. Start inference/predictions. Visualize and save predictions by running the following command. +```bash +python inference.py \ +--checkpoint \ +--data /data/processed//test.csv \ +--cat_encodings /data/processed//cat_encodings.bin \ +--tgt_scalers /data/processed//tgt_scalers.bin \ +--visualize \ +--save_predictions +``` + + + +Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance to [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. +## Advanced + +The following sections provide more details about the dataset, running training and inference, and the training results. + +### Scripts and sample code + +In the root directory, the most important files are: + +`train.py`: Entry point for training +`data_utils.py`: File containing the dataset implementation and preprocessing functions +`modeling.py`: Definition of the model +`configuration.py`: Contains configuration classes for various experiments +`test.py`: Entry point testing trained model. +`Dockerfile`: Container definition +`log_helper.py`: Contains helper functions for setting up dllogger +`criterions.py`: Definitions of loss functions + +The `scripts` directory contains scripts for default use cases: +`run_electricity.sh`: train default model on the electricity dataset +`run_traffic.sh`: train default model on the traffic dataset + +### Command-line options + +To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: +`python train.py --help`. 
+ +The following example output is printed when running the model: +``` +usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] + [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] + +optional arguments: + -h, --help show this help message and exit + --data_path DATA_PATH + --dataset {electricity,volatility,traffic,favorita} + --epochs EPOCHS + --sample_data SAMPLE_DATA SAMPLE_DATA + --batch_size BATCH_SIZE + --lr LR + --seed SEED + --use_amp Enable automatic mixed precision + --clip_grad CLIP_GRAD + --early_stopping EARLY_STOPPING + Stop training if validation loss does not improve for more than this number of epochs. + --results RESULTS + --log_file LOG_FILE + --distributed_world_size N + total number of GPUs across all nodes (default: all visible GPUs) + --distributed_rank DISTRIBUTED_RANK + rank of the current worker + --local_rank LOCAL_RANK + rank of the current worker + --overwrite_config OVERWRITE_CONFIG + JSON string used to overload config + +``` + +### Getting the data + +The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for electricity and and traffic datasets will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers. +#### Dataset guidelines + +The `data_utils.py` file contains all functions that are used to preprocess the data. Initially the data is loaded to a `pandas.DataFrame` and parsed to the common format which contains the features we will use for training. Then standardized data is cleaned, normalized, encoded and binarized. +This step does the following: +Drop all the columns that are not marked in the configuration file as used for training or preprocessing +Flatten indices in case time series are indexed by more than one column +Split the data into training, validation and test splits +Filter out all the time series shorter than minimal example length +Normalize columns marked as continuous in the configuration file +Encode as integers columns marked as categorical +Save the data in csv and binary formats + +#### Multi-dataset +In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows: +There is at least one id column +There is exactly one time column (that can also be used as a feature column) +Each feature is in a separate column +Each row represents a moment in time for only one time series +Additionally, you must specify a configuration of the network, including a data description. Refer to the example in `configuration.py` file. +### Training process + +The `train.py` script is an entry point for a training procedure. Refined recipes can be found in the `scripts` directory. +The model trains for at most `--epochs` epochs. If option `--early_stopping N` is set, then training will end if for N subsequent epochs validation loss hadn’t improved. +The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. 
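As mentioned in the [Multi-dataset](#multi-dataset) section, a new dataset is added by defining another configuration class in `configuration.py`. The sketch below is purely illustrative: the class name, column names, and split ranges are made up, and the real classes (see `ElectricityConfig`) additionally define `example_length`, `encoder_length`, the model hyperparameters, and the derived input sizes.
```python
from data_utils import InputTypes, DataTypes, FeatureSpec

class MyDatasetConfig():
    def __init__(self):
        # Feature schema: how each column of the standardized CSV is interpreted
        self.features = [
            FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL),
            FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS),
            FeatureSpec('target_value', InputTypes.TARGET, DataTypes.CONTINUOUS),
            FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS),
            FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL),
        ]
        # Dataset split boundaries expressed in units of the `time_ids` column
        self.time_ids = 'days_from_start'
        self.train_range = (0, 100)
        self.valid_range = (93, 110)
        self.test_range = (103, 120)
        self.dataset_stride = 1     # timesteps between consecutive examples
        self.scale_per_id = False   # whether to normalize each time series separately
```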
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training prepend training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. + +Example command: +``` +python -m torch.distributed.launch --nproc_per_node=8 train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=1024 \ + --sample 450000 50000 \ + --lr 1e-3 \ + --epochs 25 \ + --early_stopping 5 \ + --seed 1 \ + --use_amp \ + --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 +``` + +The model is trained by optimizing quantile loss +. After training, the checkpoint with the least validation loss is evaluated on a test split with q-risk metric . +Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary per line format), and TensorBoard logs. + +### Inference process + +Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as training data prior to running the inference. Example command: +``` +python inference.py \ +--checkpoint /results/checkpoint.pt \ +--data /data/processed/electricity_bin/test.csv \ +--tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ +--cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ +--batch_size 2048 \ +--visualize \ +--save_predictions \ +--joint_visualization \ +--results /results \ +--use_amp +``` + +In the default setting, it performs the evaluation of the model on a specified dataset and prints q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in the csv format. Option `--joint_visualization` allows us to plot graphs in TensorBoard format, allowing us to inspect the results and compare them to true values. Using `--visualize`, you can save plots for each example in a separate file. +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training and inference modes. + +#### Training performance benchmark + +In order to run training benchmarks, use the `scripts/benchmark.sh` script. + +#### Inference performance benchmark + +To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. +### Results + +The following sections provide details on how we achieved our performance and accuracy in training and inference. + +#### Training accuracy results + +We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from the hundreds of runs. + +##### Training accuracy: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. 
+ +| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|------- +| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x +| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x +| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x +| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x + + + + +##### Training accuracy: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. + +| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) +|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- +| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x +| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x +| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x +| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x + + + +##### Training stability test + +In order to get a greater picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we have chosen the architecture with the least mean test q-risk. The table below summarizes the best configurations. + +| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk +|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------| -----------|------ +| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 +| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 + + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (A100 80GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|-------------|---|------|--------|--------|-------|-------|----- +| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 +| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x +| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 +| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. 
+ + +##### Training performance: NVIDIA DGX-1 (V100 16GB) + +Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with (V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. + +| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|-------------|---|------|-------|-------|-------|------|---- +| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 +| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x +| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 +| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x + + + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +The performance metrics used were items per second. + +## Release notes +The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. + +### Changelog + +October 2021 +- Initial release + +### Known issues +There are no known issues with this model. + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG new file mode 100644 index 00000000..c3431031 Binary files /dev/null and b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG differ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py new file mode 100644 index 00000000..bef26e66 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
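# Dataset-specific configurations for TFT. Each class below defines the feature schema
# (a list of FeatureSpec entries), the dataset split boundaries, and the model
# hyperparameters; the derived input sizes at the bottom of each constructor are
# computed from that schema.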
+ +from data_utils import InputTypes, DataTypes, FeatureSpec +import datetime + +class ElectricityConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'days_from_start' # This column contains time indices across which we split the data + self.train_range = (1096, 1315) + self.valid_range = (1308, 1339) + self.test_range = (1332, 1346) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = True + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [369] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.1 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +class TrafficConfig(): + def __init__(self): + + self.features = [ + FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), + FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), + FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), + FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), + FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), + ] + # Dataset split boundaries + self.time_ids = 'sensor_day' # This column contains time indices across which we split the data + self.train_range = (0, 151) + self.valid_range = (144, 166) + self.test_range = (159, float('inf')) + self.dataset_stride = 1 #how many timesteps between examples + self.scale_per_id = False + self.missing_id_strategy = None + self.missing_cat_data_strategy='encode_all' + + # Feature sizes + self.static_categorical_inp_lens = [963] + self.temporal_known_categorical_inp_lens = [] + self.temporal_observed_categorical_inp_lens = [] + 
self.quantiles = [0.1, 0.5, 0.9] + + self.example_length = 8 * 24 + self.encoder_length = 7 * 24 + + self.n_head = 4 + self.hidden_size = 128 + self.dropout = 0.3 + self.attn_dropout = 0.0 + + #### Derived variables #### + self.temporal_known_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_observed_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) + self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) + self.static_continuous_inp_size = len([x for x in self.features + if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) + + self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) + self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) + self.num_historic_vars = sum([self.num_future_vars, + self.temporal_observed_continuous_inp_size, + self.temporal_target_size, + len(self.temporal_observed_categorical_inp_lens), + ]) + + +CONFIGS = {'electricity': ElectricityConfig, + 'traffic': TrafficConfig, + } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py new file mode 100644 index 00000000..5c9df6ae --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class QuantileLoss(nn.Module): + def __init__(self, config): + super().__init__() + self.register_buffer('q', torch.tensor(config.quantiles)) + + def forward(self, predictions, targets): + diff = predictions - targets + ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) + losses = ql.view(-1, ql.shape[-1]).mean(0) + return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py new file mode 100644 index 00000000..f38f8bfb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################ +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import pickle +import enum +import datetime + +from collections import namedtuple, OrderedDict + +import sklearn.preprocessing +from sklearn.impute import SimpleImputer +import pandas as pd +import numpy as np +from bisect import bisect + +import torch +from torch.utils.data import Dataset,IterableDataset,DataLoader + +class DataTypes(enum.IntEnum): + """Defines numerical types of each column.""" + CONTINUOUS = 0 + CATEGORICAL = 1 + DATE = 2 + STR = 3 + +class InputTypes(enum.IntEnum): + """Defines input types of each column.""" + TARGET = 0 + OBSERVED = 1 + KNOWN = 2 + STATIC = 3 + ID = 4 # Single column used as an entity identifier + TIME = 5 # Single column exclusively used as a time index + +FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) +DTYPE_MAP = { + DataTypes.CONTINUOUS : np.float32, + DataTypes.CATEGORICAL : np.int64, + DataTypes.DATE:'datetime64[ns]', + DataTypes.STR: str + } + +FEAT_ORDER = [ + (InputTypes.STATIC, DataTypes.CATEGORICAL), + (InputTypes.STATIC, DataTypes.CONTINUOUS), + (InputTypes.KNOWN, DataTypes.CATEGORICAL), + (InputTypes.KNOWN, DataTypes.CONTINUOUS), + (InputTypes.OBSERVED, DataTypes.CATEGORICAL), + (InputTypes.OBSERVED, DataTypes.CONTINUOUS), + (InputTypes.TARGET, DataTypes.CONTINUOUS), + (InputTypes.ID, DataTypes.CATEGORICAL) + ] + +FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] +DEFAULT_ID_COL = 'id' + +class TFTBinaryDataset(Dataset): + def __init__(self, path, config): + super(TFTBinaryDataset).__init__() + self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] + self.example_length = config.example_length + self.stride = config.dataset_stride + + self.grouped = pickle.load(open(path, 'rb')) + self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] + self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) + + + self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] + + # The list comprehension below is an elaborate way of rearranging data into correct order, + # simultaneously doing casting to proper types. 
Probably can be written neater + self.grouped = [ + [ + arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) + for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) + ] + for arr in self.grouped + ] + + def __len__(self): + return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 + + def __getitem__(self, idx): + g_idx = bisect(self._cum_examples_in_group, idx) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx] + + tensors = [ + torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) + if feat.size else torch.empty(0) + for feat in group + ] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + + +class TFTDataset(Dataset): + def __init__(self, path, config): + super(TFTDataset).__init__() + self.features = config.features + self.data = pd.read_csv(path, index_col=0) + self.example_length = config.example_length + self.stride = config.dataset_stride + + # name field is a column name. + # there can be multiple entries with the same name because one column can be interpreted in many ways + time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) + id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) + if not id_col_name in self.data.columns: + id_col_name = DEFAULT_ID_COL + self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] + self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) + col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} + + + self.data.sort_values(time_col_name,inplace=True) + self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns + self.data = self.data.astype(col_dtypes) + self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) + self.grouped = list(self.data.groupby(id_col_name)) + + self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) + + def __len__(self): + return self._cum_examples_in_group[-1] + + def __getitem__(self, idx): + g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) + e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx + + group = self.grouped[g_idx][1] + sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] + + # We need to be sure that tensors are returned in the correct order + tensors = tuple([] for _ in range(8)) + for v in self.features: + if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: + tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: + tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == 
InputTypes.TARGET: + tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) + elif v.feature_type == InputTypes.ID: + tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) + + + tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] + + return OrderedDict(zip(FEAT_NAMES, tensors)) + +def get_dataset_splits(df, config): + + if hasattr(config, 'relative_split') and config.relative_split: + forecast_len = config.example_length - config.encoder_length + # The valid split is shifted from the train split by number of the forecast steps to the future. + # The test split is shifted by the number of the forecast steps from the valid split + train = [] + valid = [] + test = [] + + for _, group in df.groupby(DEFAULT_ID_COL): + index = group[config.time_ids] + _train = group.loc[index < config.valid_boundary] + _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] + _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] + train.append(_train) + valid.append(_valid) + test.append(_test) + + train = pd.concat(train, axis=0) + valid = pd.concat(valid, axis=0) + test = pd.concat(test, axis=0) + else: + index = df[config.time_ids] + train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] + valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] + test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] + + return train, valid, test + +def flatten_ids(df, config): + + if config.missing_id_strategy == 'drop': + if hasattr(config, 'combine_ids') and config.combine_ids: + index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) + else: + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + index = df[id_col].isna() + index = index[index == True].index # Extract indices of nans + df.drop(index, inplace=True) + + if not (hasattr(config, 'combine_ids') and config.combine_ids): + id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) + ids = df[id_col].apply(str) + df.drop(id_col, axis=1, inplace=True) + encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) + df[DEFAULT_ID_COL] = encoder.transform(ids) + encoders = OrderedDict({id_col: encoder}) + + else: + encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} + encoders = OrderedDict(encoders) + lens = [len(v.classes_) for v in encoders.values()] + clens = np.roll(np.cumprod(lens), 1) + clens[0] = 1 + + # this takes a looooooot of time. Probably it would be better to create 2 dummy columns + df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) + df.drop(config.combine_ids, axis=1, inplace=True) + + return DEFAULT_ID_COL, encoders + +def impute(df, config): + #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance + if not hasattr(config, 'missing_data_label'): + return df, None + else: + imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) + data = df.values + col_mask = (data == config.missing_data_label).all(axis=0) + data[:,~col_mask] = imp.fit_transform(data) + return data, mask + +def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): + tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] + real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) + real_scalers = {} + tgt_scalers = {} + + def apply_scalers(df, name=None): + if name is None: + name = df.name + mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None + df[real_cols] = real_scalers[name].transform(df[real_cols]) + if mask is not None and any(mask): + df[real_cols].mask(mask, 10**9) + df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) + return df + + if config.scale_per_id: + for identifier, sliced in train.groupby(id_col): + data = sliced[real_cols] + data, _ = impute(data, config) + real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) + # XXX We should probably remove examples that contain NaN as a target + target = sliced[tgt_cols] + tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) + + train = train.groupby(id_col).apply(apply_scalers) + # For valid and testing leave only timeseries previously present in train subset + # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away + valid = valid.loc[valid[id_col].isin(real_scalers.keys())] + valid = valid.groupby(id_col).apply(apply_scalers) + test = test.loc[test[id_col].isin(real_scalers.keys())] + test = test.groupby(id_col).apply(apply_scalers) + + else: + data, _ = impute(train[real_cols], config) + real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) + tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) + + train = apply_scalers(train, name='') + valid = apply_scalers(valid, name='') + test = apply_scalers(test, name='') + + return train, valid, test, real_scalers, tgt_scalers + +def encode_categoricals(train, valid, test, config): + cat_encodings = {} + cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) + num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? + # For TC performance reasons we might want for num_classes[i] be divisible by 8 + + # Train categorical encoders + for c in cat_cols: + if config.missing_cat_data_strategy == 'special_token': + #XXX this will probably require some data augmentation + unique = train[c].unique() + valid[c].loc[valid[c].isin(unique)] = '' + test[c].loc[test[c].isin(unique)] = '' + + if config.missing_cat_data_strategy == 'encode_all' or \ + config.missing_cat_data_strategy == 'special_token': + srs = pd.concat([train[c], valid[c], test[c]]).apply(str) + cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) + elif config.missing_cat_data_strategy == 'drop': + # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks + # to prevent data from having temporal gaps + pass + num_classes.append(srs.nunique()) + print('Categorical variables encodings lens: ', num_classes) + + + for split in [train, valid, test]: + for c in cat_cols: + srs = split[c].apply(str) + split[c] = srs + split.loc[:,c] = cat_encodings[c].transform(srs) + + return cat_encodings + + +def preprocess(src_path, dst_path, config): + df = pd.read_csv(src_path, index_col=0) + + for c in config.features: + if c.feature_embed_type == DataTypes.DATE: + df[c.name] = pd.to_datetime(df[c.name]) + + # Leave only columns relevant to preprocessing + relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) + df = df[relevant_columns] + + + id_col, id_encoders = flatten_ids(df, config) + df = df.reindex(sorted(df.columns), axis=1) + + train, valid, test = get_dataset_splits(df, config) + + # Length filter the data (all timeseries shorter than example len will be dropped) + #for df in [train, valid, test]: + # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) + train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) + valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) + test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) + + train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) + + cat_encodings = encode_categoricals(train, valid, test, config) + + os.makedirs(dst_path, exist_ok=True) + + train.to_csv(os.path.join(dst_path, 'train.csv')) + valid.to_csv(os.path.join(dst_path, 'valid.csv')) + test.to_csv(os.path.join(dst_path, 'test.csv')) + + # Save relevant columns in binary form for faster dataloading + # IMORTANT: We always expect id to be a single column indicating the complete timeseries + # We also expect a copy of id in form of static categorical input!!! 
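    # Each per-id group is stored as one float32 matrix reinterpreted as int32 bit patterns;
    # TFTBinaryDataset later views the bytes back to float32 and casts each column to its
    # final dtype, so the column order written here has to line up with the feature list
    # in the dataset configuration.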
+ col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] + grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] + grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] + grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] + + pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) + pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) + pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) + + + with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: + pickle.dump(real_scalers, f) + with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: + pickle.dump(tgt_scalers, f) + with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: + pickle.dump(cat_encodings, f) + with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: + pickle.dump(id_encoders, f) + + +def sample_data(dataset, num_samples): + if num_samples < 0: + return dataset + else: + return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) + + +def standarize_electricity(path): + """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" + df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') + df.index = pd.to_datetime(df.index) + df.sort_index(inplace=True) + + # Used to determine the start and end dates of a series + output = df.resample('1h').mean().replace(0., np.nan) + + earliest_time = output.index.min() + + df_list = [] + for label in output: + print('Processing {}'.format(label)) + srs = output[label] + + start_date = min(srs.fillna(method='ffill').dropna().index) + end_date = max(srs.fillna(method='bfill').dropna().index) + + active_range = (srs.index >= start_date) & (srs.index <= end_date) + srs = srs[active_range].fillna(0.) 
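    # Build one frame per meter: the hourly usage series plus calendar features derived
    # from the timestamp (hour, day, day of week, month) and the running time indices.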
+ + tmp = pd.DataFrame({'power_usage': srs}) + date = tmp.index + tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( + date - earliest_time).days * 24 + tmp['days_from_start'] = (date - earliest_time).days + tmp['categorical_id'] = label + tmp['date'] = date + tmp['id'] = label + tmp['hour'] = date.hour + tmp['day'] = date.day + tmp['day_of_week'] = date.dayofweek + tmp['month'] = date.month + + df_list.append(tmp) + + output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) + + output['categorical_id'] = output['id'].copy() + output['hours_from_start'] = output['t'] + output['categorical_day_of_week'] = output['day_of_week'].copy() + output['categorical_hour'] = output['hour'].copy() + + output.to_csv(os.path.join(path, 'standarized.csv')) + +def standarize_volatility(path): + df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index + + # Adds additional date/day fields + idx = [str(s).split('+')[0] for s in df.index + ] # ignore timezones, we don't need them + dates = pd.to_datetime(idx) + df['date'] = dates + df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days + df['day_of_week'] = dates.dayofweek + df['day_of_month'] = dates.day + df['week_of_year'] = dates.weekofyear + df['month'] = dates.month + df['year'] = dates.year + df['categorical_id'] = df['Symbol'].copy() + + # Processes log volatility + vol = df['rv5_ss'].copy() + vol.loc[vol == 0.] = np.nan + df['log_vol'] = np.log(vol) + + # Adds static information + symbol_region_mapping = { + '.AEX': 'EMEA', + '.AORD': 'APAC', + '.BFX': 'EMEA', + '.BSESN': 'APAC', + '.BVLG': 'EMEA', + '.BVSP': 'AMER', + '.DJI': 'AMER', + '.FCHI': 'EMEA', + '.FTMIB': 'EMEA', + '.FTSE': 'EMEA', + '.GDAXI': 'EMEA', + '.GSPTSE': 'AMER', + '.HSI': 'APAC', + '.IBEX': 'EMEA', + '.IXIC': 'AMER', + '.KS11': 'APAC', + '.KSE': 'APAC', + '.MXX': 'AMER', + '.N225': 'APAC ', + '.NSEI': 'APAC', + '.OMXC20': 'EMEA', + '.OMXHPI': 'EMEA', + '.OMXSPI': 'EMEA', + '.OSEAX': 'EMEA', + '.RUT': 'EMEA', + '.SMSI': 'EMEA', + '.SPX': 'AMER', + '.SSEC': 'APAC', + '.SSMI': 'EMEA', + '.STI': 'APAC', + '.STOXX50E': 'EMEA' + } + + df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) + + # Performs final processing + output_df_list = [] + for grp in df.groupby('Symbol'): + sliced = grp[1].copy() + sliced.sort_values('days_from_start', inplace=True) + # Impute log volatility values + sliced['log_vol'].fillna(method='ffill', inplace=True) + sliced.dropna() + output_df_list.append(sliced) + + df = pd.concat(output_df_list, axis=0) + + df.to_csv(os.path.join(path, 'standarized.csv')) + + +def standarize_traffic(path): + def process_list(s, variable_type=int, delimiter=None): + """Parses a line in the PEMS format to a list.""" + if delimiter is None: + l = [ + variable_type(i) for i in s.replace('[', '').replace(']', '').split() + ] + else: + l = [ + variable_type(i) + for i in s.replace('[', '').replace(']', '').split(delimiter) + ] + + return l + + def read_single_list(filename): + """Returns single list from a file in the PEMS-custom format.""" + with open(os.path.join(path, filename), 'r') as dat: + l = process_list(dat.readlines()[0]) + return l + + def read_matrix(filename): + """Returns a matrix from a file in the PEMS-custom format.""" + array_list = [] + with open(os.path.join(path, filename), 'r') as dat: + lines = dat.readlines() + for i, line in enumerate(lines): + if (i + 1) % 50 == 0: + print('Completed {} of {} rows for {}'.format(i + 1, len(lines), + filename)) + array = [ 
+ process_list(row_split, variable_type=float, delimiter=None) + for row_split in process_list( + line, variable_type=str, delimiter=';') + ] + array_list.append(array) + + return array_list + + shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 + train_dayofweek = read_single_list('PEMS_trainlabels') + train_tensor = read_matrix('PEMS_train') + test_dayofweek = read_single_list('PEMS_testlabels') + test_tensor = read_matrix('PEMS_test') + + # Inverse permutate shuffle order + print('Shuffling') + inverse_mapping = { + new_location: previous_location + for previous_location, new_location in enumerate(shuffle_order) + } + reverse_shuffle_order = np.array([ + inverse_mapping[new_location] + for new_location, _ in enumerate(shuffle_order) + ]) + + # Group and reoder based on permuation matrix + print('Reodering') + day_of_week = np.array(train_dayofweek + test_dayofweek) + combined_tensor = np.array(train_tensor + test_tensor) + + day_of_week = day_of_week[reverse_shuffle_order] + combined_tensor = combined_tensor[reverse_shuffle_order] + + # Put everything back into a dataframe + print('Parsing as dataframe') + labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] + + hourly_list = [] + for day, day_matrix in enumerate(combined_tensor): + # Hourly data + hourly = pd.DataFrame(day_matrix.T, columns=labels) + hourly['hour_on_day'] = [int(i / 6) for i in hourly.index + ] # sampled at 10 min intervals + if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: + raise ValueError('Invalid hour! {}-{}'.format( + hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) + + hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] + hourly['sensor_day'] = day + hourly['time_on_day'] = hourly.index + hourly['day_of_week'] = day_of_week[day] + + hourly_list.append(hourly) + + hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) + + # Flatten such that each entitiy uses one row in dataframe + store_columns = [c for c in hourly_frame.columns if 'traj' in c] + other_columns = [c for c in hourly_frame.columns if 'traj' not in c] + flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + + other_columns + ['id']) + + for store in store_columns: + print('Processing {}'.format(store)) + + sliced = hourly_frame[[store] + other_columns].copy() + sliced.columns = ['values'] + other_columns + sliced['id'] = int(store.replace('traj_', '')) + + # Sort by Sensor-date-time + key = sliced['id'].apply(str) \ + + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ + + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) + sliced = sliced.set_index(key).sort_index() + + sliced['values'] = sliced['values'].fillna(method='ffill') + sliced['prev_values'] = sliced['values'].shift(1) + sliced['next_values'] = sliced['values'].shift(-1) + + flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) + + # Filter to match range used by other academic papers + index = flat_df['sensor_day'] + flat_df = flat_df[index < 173].copy() + + # Creating columns fo categorical inputs + flat_df['categorical_id'] = flat_df['id'].copy() + flat_df['hours_from_start'] = flat_df['time_on_day'] \ + + flat_df['sensor_day']*24. 
+ flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() + flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() + + flat_df.to_csv(os.path.join(path, 'standarized.csv')) + + +# XXX needs rework +def standarize_favorita(data_folder): + import gc + # Extract only a subset of data to save/process for efficiency + start_date = pd.datetime(2015, 1, 1) + end_date = pd.datetime(2016, 6, 1) + + print('Regenerating data...') + + # load temporal data + temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) + + store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) + oil = pd.read_csv( + os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] + holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) + items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) + transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) + + # Take first 6 months of data + temporal['date'] = pd.to_datetime(temporal['date']) + + # Filter dates to reduce storage space requirements + if start_date is not None: + temporal = temporal[(temporal['date'] >= start_date)] + if end_date is not None: + temporal = temporal[(temporal['date'] < end_date)] + + dates = temporal['date'].unique() + + # Add trajectory identifier + temporal['traj_id'] = temporal['store_nbr'].apply( + str) + '_' + temporal['item_nbr'].apply(str) + temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( + str) + + # Remove all IDs with negative returns + print('Removing returns data') + min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() + valid_ids = set(min_returns[min_returns >= 0].index) + selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) + new_temporal = temporal[selector].copy() + del temporal + gc.collect() + temporal = new_temporal + temporal['open'] = 1 + + # Resampling + print('Resampling to regular grid') + resampled_dfs = [] + for traj_id, raw_sub_df in temporal.groupby('traj_id'): + print('Resampling', traj_id) + sub_df = raw_sub_df.set_index('date', drop=True).copy() + sub_df = sub_df.resample('1d').last() + sub_df['date'] = sub_df.index + sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ + = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') + sub_df['open'] = sub_df['open'].fillna( + 0) # flag where sales data is unknown + sub_df['log_sales'] = np.log(sub_df['unit_sales']) + + resampled_dfs.append(sub_df.reset_index(drop=True)) + + new_temporal = pd.concat(resampled_dfs, axis=0) + del temporal + gc.collect() + temporal = new_temporal + + print('Adding oil') + oil.name = 'oil' + oil.index = pd.to_datetime(oil.index) + #XXX the lines below match the value of the oil on given date with the rest of the timeseries + # missing values in oil series are copied from the index before. Then the oil series is joined with + # temporal. Then there are some dates present in temporal which arent present in oil, for which + # oil values is substituted with -1. WHY?! + #TODO: check how many nans there are after first step. Previously oil series was extended by dates + # present in dates variable with nan value, which were forward filled. + # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. + # This leaves us with more nans after first step than previously. To achieve previous behavior + # we have to join series before filling nans. 
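+    # A hypothetical sketch of the previous behavior described above (not what this code does);
+    # the name oil_on_dates is only for illustration:
+    #   oil_on_dates = oil.reindex(pd.to_datetime(dates)).fillna(method='ffill')
+    #   temporal = temporal.join(oil_on_dates, on='date', how='left')
+    # i.e. extend the oil series to every date first, forward-fill the gaps, and only then join.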
+ temporal = temporal.join( + #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') + oil.loc[oil.index.isin(dates)], on='date', how='left') + temporal['oil'] = temporal['oil'].fillna(method='ffill') + temporal['oil'] = temporal['oil'].fillna(-1) + + print('Adding store info') + temporal = temporal.join(store_info, on='store_nbr', how='left') + + print('Adding item info') + temporal = temporal.join(items, on='item_nbr', how='left') + + transactions['date'] = pd.to_datetime(transactions['date']) + temporal = temporal.merge( + transactions, + left_on=['date', 'store_nbr'], + right_on=['date', 'store_nbr'], + how='left') + temporal['transactions'] = temporal['transactions'].fillna(-1) + + # Additional date info + temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek + temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day + temporal['month'] = pd.to_datetime(temporal['date'].values).month + + # Add holiday info + print('Adding holidays') + holiday_subset = holidays[holidays['transferred'].apply( + lambda x: not x)].copy() + holiday_subset.columns = [ + s if s != 'type' else 'holiday_type' for s in holiday_subset.columns + ] + holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) + local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] + regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] + national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] + + temporal['national_hol'] = temporal.merge( + national_holidays, left_on=['date'], right_on=['date'], + how='left')['description'].fillna('') + temporal['regional_hol'] = temporal.merge( + regional_holidays, + left_on=['state', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + temporal['local_hol'] = temporal.merge( + local_holidays, + left_on=['city', 'date'], + right_on=['locale_name', 'date'], + how='left')['description'].fillna('') + + temporal.sort_values('unique_id', inplace=True) + + # Transform date to integer index + start_date = pd.to_datetime(min(temporal['date'])) + dates = temporal['date'].apply(pd.to_datetime) + temporal['days_from_start'] = (dates - start_date).dt.days + temporal['categorical_id'] = temporal['traj_id'].copy() + + print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) + temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py new file mode 100644 index 00000000..f8f5b331 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py @@ -0,0 +1,73 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exponential Moving Average (EMA) of model updates +""" + +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +class ModelEma(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + """ + def __init__(self, model, decay=0.999, device=None): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_( model_v ) + + def forward(self, x): + return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py new file mode 100644 index 00000000..79fb1fc4 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
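+
+# Helpers that pin each training process to CPU cores close to its GPU, using pynvml
+# to query the per-device CPU affinity mask and os.sched_setaffinity to apply it.
+# See set_affinity() at the bottom for the supported modes: 'socket', 'single',
+# 'single_unique', 'socket_unique_interleaved' and 'socket_unique_continuous'.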
+ +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = 
f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py new file mode 100644 index 00000000..056429f1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import torch +from torch.utils.data import DataLoader +from torch.cuda import amp +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from modeling import TemporalFusionTransformer +from configuration import ElectricityConfig +from data_utils import TFTDataset +from utils import PerformanceMeter +from criterions import QuantileLoss +import dllogger +from log_helper import setup_logger + +def _unscale_per_id(config, values, ids, scalers): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + flat_values['id'] = ids + df_list = [] + for idx, group in flat_values.groupby('id'): + scaler = scalers[idx] + group_copy = group.copy() + for col in group_copy.columns: + if not 'id' in col: + _col = np.expand_dims(group_copy[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + group_copy[col] = _t_col + df_list.append(group_copy) + flat_values = pd.concat(df_list, axis=0) + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] + flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def _unscale(config, values, scaler): + values = values.cpu().numpy() + num_horizons = config.example_length - config.encoder_length + 1 + flat_values = pd.DataFrame( + values, + columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] + ) + for col in flat_values.columns: + if not 'id' in col: + _col = np.expand_dims(flat_values[col].values, -1) + _t_col = scaler.inverse_transform(_col)[:,-1] + flat_values[col] = _t_col + + flat_values = flat_values[[col for col in flat_values if not 'id' in col]] 
+ flat_tensor = torch.from_numpy(flat_values.values) + return flat_tensor + +def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): + model.eval() + predictions = [] + targets = [] + ids = [] + perf_meter = PerformanceMeter() + n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 + + for step, batch in enumerate(data_loader): + perf_meter.reset_current_lap() + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + ids.append(batch['id'][:,0,:]) + targets.append(batch['target']) + predictions.append(model(batch).float()) + + perf_meter.update(args.batch_size * n_workers, + exclude_from_total=step in [0, len(data_loader)-1]) + + targets = torch.cat(targets, dim=0) + if not extend_targets: + targets = targets[:,config.encoder_length:,:] + predictions = torch.cat(predictions, dim=0) + + if config.scale_per_id: + ids = torch.cat(ids, dim=0).cpu().numpy() + + unscaled_predictions = torch.stack( + [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) + else: + ids = None + unscaled_predictions = torch.stack( + [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], + dim=-1) + unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) + + return unscaled_predictions, unscaled_targets, ids, perf_meter + +def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) + + num_horizons = config.example_length - config.encoder_length + 1 + pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) + pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] + unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) + + ids = torch.from_numpy(ids.squeeze()) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): + for i, ex in enumerate(g): + df = pd.DataFrame(ex.numpy(), + index=range(num_horizons - ex.shape[0], num_horizons), + columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) + fig = df.plot().get_figure() + ax = fig.get_axes()[0] + _values = df.values[config.encoder_length-1:,:] + ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') + os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) + fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) + +def inference(args, config, model, data_loader, scalers, cat_encodings): + unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) + + if args.joint_visualization or args.save_predictions: + ids = torch.from_numpy(ids.squeeze()) + #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) + joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) + graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} + for key, g in graphs.items(): #timeseries id, joint targets and predictions + _g = {'targets': g[:,:,0]} + 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)})
+
+            if args.joint_visualization:
+                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
+                for q, t in _g.items(): # target and quantiles, time horizon values
+                    if q == 'targets':
+                        targets = torch.cat([t[:,0], t[-1,1:]]) # WIP
+                        # We want to plot targets on the same graph as predictions. Probably could be written better.
+                        for i, val in enumerate(targets):
+                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
+                        continue
+
+                    # Tensor t contains different time horizons which are shifted in phase
+                    # Next lines realign them
+                    y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan'))
+                    for i in range(y.shape[1]):
+                        y[i:i+t.shape[0], i] = t[:,i]
+
+                    for i, vals in enumerate(y): # timestep, time horizon values
+                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
+                summary_writer.close()
+
+            if args.save_predictions:
+                for q, t in _g.items():
+                    df = pd.DataFrame(t.tolist())
+                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
+                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
+                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
+
+    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
+    normalizer = unscaled_targets.abs().mean()
+    q_risk = 2 * losses / normalizer
+
+    perf_dict = {
+        'throughput': perf_meter.avg,
+        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
+        'latency_p90': perf_meter.p(90),
+        'latency_p95': perf_meter.p(95),
+        'latency_p99': perf_meter.p(99),
+        'total_inference_time': perf_meter.total_time,
+    }
+
+    return q_risk, perf_dict
+
+
+def main(args):
+
+    setup_logger(args)
+    # Set up model
+    state_dict = torch.load(args.checkpoint)
+    config = state_dict['config']
+    model = TemporalFusionTransformer(config).cuda()
+    model.load_state_dict(state_dict['model'])
+    model.eval()
+    model.cuda()
+
+    # Set up dataset
+    test_split = TFTDataset(args.data, config)
+    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
+
+    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
+    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
+
+    if args.visualize:
+        # TODO: abstract away all forms of visualization.
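+        # visualize_v2 saves one plot per example as a PDF under
+        # <results>/single_example_vis/<series id>/<example index>.pdf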
+ visualize_v2(args, config, model, data_loader, scalers, cat_encodings) + + quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, **perf_dict} + dllogger.log(step=(), data=finish_log, verbosity=1) + print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(*quantiles)) + print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( + perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', type=str, + help='Path to the checkpoint') + parser.add_argument('--data', type=str, + help='Path to the test split of the dataset') + parser.add_argument('--tgt_scalers', type=str, + help='Path to the tgt_scalers.bin file produced by the preprocessing') + parser.add_argument('--cat_encodings', type=str, + help='Path to the cat_encodings.bin file produced by the preprocessing') + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on the separate plot') + parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on separate plot. Projections will be concatenated.') + parser.add_argument('--save_predictions', action='store_true') + parser.add_argument('--results', type=str, default='/results') + parser.add_argument('--log_file', type=str, default='dllogger.json') + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py new file mode 100644 index 00000000..83d2ac7f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
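+
+# Logging utilities for this model: a TensorBoard backend for dllogger plus
+# setup_logger(), which wires up JSON, stdout and TensorBoard logging and records
+# environment information (framework versions and GPU configuration).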
+ +import os +import subprocess +import sys +import itertools +import atexit + +import dllogger +from dllogger import Backend, JSONStreamBackend, StdOutBackend + +import torch.distributed as dist +from torch.utils.tensorboard import SummaryWriter + +class TensorBoardBackend(Backend): + def __init__(self, verbosity, log_dir): + super().__init__(verbosity=verbosity) + self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), + flush_secs=120, + max_queue=200 + ) + self.hp_cache = None + atexit.register(self.summary_writer.close) + + @property + def log_level(self): + return self._log_level + + def metadata(self, timestamp, elapsedtime, metric, metadata): + pass + + def log(self, timestamp, elapsedtime, step, data): + if step == 'HPARAMS': + parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} + #Unpack list and tuples + for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: + parameters.update(d) + #Remove custom classes + parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} + parameters.update({k:'None' for k, v in data.items() if v is None}) + self.hp_cache = parameters + if step == (): + if self.hp_cache is None: + print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) + return + self.summary_writer.add_hparams(self.hp_cache, data) + if not isinstance(step, int): + return + for k, v in data.items(): + self.summary_writer.add_scalar(k, v, step) + + def flush(self): + pass + +def setup_logger(args): + os.makedirs(args.results, exist_ok=True) + log_path = os.path.join(args.results, args.log_file) + + if os.path.exists(log_path): + for i in itertools.count(): + s_fname = args.log_file.split('.') + fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
+            log_path = os.path.join(args.results, fname)
+            if not os.path.exists(log_path):
+                break
+
+    def metric_format(metric, metadata, value):
+        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
+    def step_format(step):
+        if step == ():
+            return "Finished |"
+        elif isinstance(step, int):
+            return "Step {0: <5} |".format(step)
+        return "Step {} |".format(step)
+
+
+    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
+        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
+                                TensorBoardBackend(verbosity=1, log_dir=args.results),
+                                StdOutBackend(verbosity=2,
+                                              step_format=step_format,
+                                              prefix_format=lambda x: "")#,
+                                              #metric_format=metric_format)
+                                ])
+    else:
+        dllogger.init(backends=[])
+    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
+
+    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
+    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
+
+    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
+    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
+    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'})
+    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
+    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
+    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
+    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
+
+
+def get_framework_env_vars():
+    return {
+        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
+        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
+        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
+        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
+        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
+        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
+        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
+        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
+        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
+        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
+    }
+
+def get_system_info():
+    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
+    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
+    system_info = [x for x in system_info if x]
+    return {'system_info': system_info}
diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py new file mode 100644 index 00000000..65e64983 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor +from typing import Dict, Tuple, Optional, List + +if os.environ.get("TFT_SCRIPTING", False): + from torch.nn import LayerNorm +else: + from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +class MaybeLayerNorm(nn.Module): + def __init__(self, output_size, hidden_size, eps): + super().__init__() + if output_size and output_size == 1: + self.ln = nn.Identity() + else: + self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) + + def forward(self, x): + return self.ln(x) + + +class GLU(nn.Module): + def __init__(self, hidden_size, output_size): + super().__init__() + self.lin = nn.Linear(hidden_size, output_size * 2) + + def forward(self, x: Tensor) -> Tensor: + x = self.lin(x) + x = F.glu(x) + return x + + +class GRN(nn.Module): + def __init__(self, + input_size, + hidden_size, + output_size=None, + context_hidden_size=None, + dropout=0): + super().__init__() + + + self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) + self.lin_a = nn.Linear(input_size, hidden_size) + if context_hidden_size is not None: + self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) + self.lin_i = nn.Linear(hidden_size, hidden_size) + self.glu = GLU(hidden_size, output_size if output_size else hidden_size) + self.dropout = nn.Dropout(dropout) + self.out_proj = nn.Linear(input_size, output_size) if output_size else None + + def forward(self, a: Tensor, c: Optional[Tensor] = None): + x = self.lin_a(a) + if c is not None: + x = x + self.lin_c(c).unsqueeze(1) + x = F.elu(x) + x = self.lin_i(x) + x = self.dropout(x) + x = self.glu(x) + y = a if not self.out_proj else self.out_proj(a) + x = x + y + x = self.layer_norm(x) + return x + +class TFTEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.s_cat_inp_lens = config.static_categorical_inp_lens + self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens + self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens + self.s_cont_inp_size = config.static_continuous_inp_size + self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size + self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size + self.t_tgt_size = config.temporal_target_size + + self.hidden_size = config.hidden_size + + # There are 7 types of input: + # 1. Static categorical + # 2. Static continuous + # 3. Temporal known a priori categorical + # 4. Temporal known a priori continuous + # 5. Temporal observed categorical + # 6. 
Temporal observed continuous + # 7. Temporal observed targets (time series obseved so far) + + self.s_cat_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None + self.t_cat_k_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None + self.t_cat_o_embed = nn.ModuleList([ + nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None + + self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) + + self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None + self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None + self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None + self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) + + if self.s_cont_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) + if self.t_cont_k_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) + if self.t_cont_o_embedding_vectors is not None: + torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) + torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) + + def _apply_embedding(self, + cat: Optional[Tensor], + cont: Optional[Tensor], + cat_emb: Optional[nn.ModuleList], + cont_emb: Tensor, + cont_bias: Tensor, + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None + if cont is not None: + #the line below is equivalent to following einsums + #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) + #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) + e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) + e_cont = e_cont + cont_bias + else: + e_cont = None + + if e_cat is not None and e_cont is not None: + return torch.cat([e_cat, e_cont], dim=-2) + elif e_cat is not None: + return e_cat + elif e_cont is not None: + return e_cont + else: + return None + + def forward(self, x: Dict[str, Tensor]): + # temporal/static categorical/continuous known/observed input + s_cat_inp = x.get('s_cat', None) + s_cont_inp = x.get('s_cont', None) + t_cat_k_inp = x.get('k_cat', None) + t_cont_k_inp = x.get('k_cont', None) + t_cat_o_inp = x.get('o_cat', None) + t_cont_o_inp = x.get('o_cont', None) + t_tgt_obs = x['target'] # Has to be present + + # Static inputs are expected to be equal for all timesteps + # For memory efficiency there is no assert statement + s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None + s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None + + s_inp = self._apply_embedding(s_cat_inp, + s_cont_inp, + self.s_cat_embed, + self.s_cont_embedding_vectors, + self.s_cont_embedding_bias) + t_known_inp = 
self._apply_embedding(t_cat_k_inp, + t_cont_k_inp, + self.t_cat_k_embed, + self.t_cont_k_embedding_vectors, + self.t_cont_k_embedding_bias) + t_observed_inp = self._apply_embedding(t_cat_o_inp, + t_cont_o_inp, + self.t_cat_o_embed, + self.t_cont_o_embedding_vectors, + self.t_cont_o_embedding_bias) + + # Temporal observed targets + # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) + t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) + t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias + + return s_inp, t_known_inp, t_observed_inp, t_observed_tgt + +class VariableSelectionNetwork(nn.Module): + def __init__(self, config, num_inputs): + super().__init__() + self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) + self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) + + def forward(self, x: Tensor, context: Optional[Tensor] = None): + Xi = x.reshape(*x.shape[:-2], -1) + grn_outputs = self.joint_grn(Xi, c=context) + sparse_weights = F.softmax(grn_outputs, dim=-1) + transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] + transformed_embed = torch.stack(transformed_embed_list, dim=-1) + #the line below performs batched matrix vector multiplication + #for temporal features it's bthf,btf->bth + #for static features it's bhf,bf->bh + variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) + + return variable_ctx, sparse_weights + +class StaticCovariateEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.vsn = VariableSelectionNetwork(config, config.num_static_vars) + self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + variable_ctx, sparse_weights = self.vsn(x) + + # Context vectors: + # variable selection context + # enrichment context + # state_c context + # state_h context + cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) + + return cs, ce, ch, cc + + +class InterpretableMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.n_head = config.n_head + assert config.hidden_size % config.n_head == 0 + self.d_head = config.hidden_size // config.n_head + self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) + self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.out_dropout = nn.Dropout(config.dropout) + self.scale = self.d_head**-0.5 + self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) + + def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: + bs, t, h_size = x.shape + qkv = self.qkv_linears(x) + q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) + q = q.view(bs, t, self.n_head, self.d_head) + k = k.view(bs, t, self.n_head, self.d_head) + v = v.view(bs, t, self.d_head) + + # attn_score = torch.einsum('bind,bjnd->bnij', q, k) + attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) + attn_score.mul_(self.scale) + + if mask_future_timesteps: + attn_score = 
attn_score + self._mask + + attn_prob = F.softmax(attn_score, dim=3) + attn_prob = self.attn_dropout(attn_prob) + + # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) + attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) + m_attn_vec = torch.mean(attn_vec, dim=1) + out = self.out_proj(m_attn_vec) + out = self.out_dropout(out) + + return out, attn_vec + + + +class TemporalFusionTransformer(nn.Module): + """ + Implementation of https://arxiv.org/abs/1912.09363 + """ + def __init__(self, config): + super().__init__() + + if hasattr(config, 'model'): + config = config.model + + self.encoder_length = config.encoder_length #this determines from how distant past we want to use data from + + self.embedding = TFTEmbedding(config) + self.static_encoder = StaticCovariateEncoder(config) + + self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) + self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) + self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) + + + self.input_gate = GLU(config.hidden_size, config.hidden_size) + self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.enrichment_grn = GRN(config.hidden_size, + config.hidden_size, + context_hidden_size=config.hidden_size, + dropout=config.dropout) + self.attention = InterpretableMultiHeadAttention(config) + self.attention_gate = GLU(config.hidden_size, config.hidden_size) + self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.positionwise_grn = GRN(config.hidden_size, + config.hidden_size, + dropout=config.dropout) + + self.decoder_gate = GLU(config.hidden_size, config.hidden_size) + self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) + + self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) + + def forward(self, x: Dict[str, Tensor]) -> Tensor: + s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) + + # Static context + cs, ce, ch, cc = self.static_encoder(s_inp) + ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states + + # Temporal input + _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] + if t_observed_inp is not None: + _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) + + historical_inputs = torch.cat(_historical_inputs, dim=-2) + future_inputs = t_known_inp[:, self.encoder_length:] + + # Encoders + historical_features, _ = self.history_vsn(historical_inputs, cs) + history, state = self.history_encoder(historical_features, (ch, cc)) + future_features, _ = self.future_vsn(future_inputs, cs) + future, _ = self.future_encoder(future_features, state) + torch.cuda.synchronize() # this call gives perf boost for unknown reasons + + # skip connection + input_embedding = torch.cat([historical_features, future_features], dim=1) + temporal_features = torch.cat([history, future], dim=1) + temporal_features = self.input_gate(temporal_features) + temporal_features = temporal_features + input_embedding + temporal_features = self.input_gate_ln(temporal_features) + + # Static enrichment + enriched = self.enrichment_grn(temporal_features, c=ce) + + # Temporal self attention + x, _ = self.attention(enriched, mask_future_timesteps=True) + + # Don't compute hictorical quantiles + x = x[:, self.encoder_length:, :] + temporal_features = temporal_features[:, self.encoder_length:, :] + enriched = enriched[:, self.encoder_length:, :] + 
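+        # Gate the attention output (GLU), add the residual connection and normalize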
+ x = self.attention_gate(x) + x = x + enriched + x = self.attention_ln(x) + + # Position-wise feed-forward + x = self.positionwise_grn(x) + + # Final skip connection + x = self.decoder_gate(x) + x = x + temporal_features + x = self.decoder_ln(x) + + out = self.quantile_proj(x) + + return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt @@ -0,0 +1 @@ +tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh new file mode 100644 index 00000000..c8a04c36 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh @@ -0,0 +1,54 @@ +#! /bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) +DATASETS=(electricity traffic) + +rm -r /tmp/benchmark_results + +for DATASET in ${DATASETS[@]} +do + for NGPU in ${WORKER_NUMS[@]} + do + for BATCH_SIZE in 512 1024 1536 2048 2560 + do + for USE_AMP in --use_amp "" + do + for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" + do + EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" + python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset ${DATASET} \ + --data_path /data/processed/${DATASET}_bin \ + --batch_size=${BATCH_SIZE} \ + --lr 5e-4 \ + --epochs 1 \ + --sample 100000 5000 \ + --seed 1 \ + ${USE_AMP} \ + ${AFFINITY} \ + --clip_grad 0.1 \ + --results /tmp/benchmark_results/${EXP_NAME} + done + done + done + done +done +for P in `ls /tmp/benchmark_results/`; +do + echo ${P} + tail -n 1 /tmp/benchmark_results/${P}/dllogger.json +done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh new file mode 100644 index 00000000..d4c7c7e1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DATAPATH='/data' + +declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' + ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' + ) + +mkdir -p ${DATAPATH}/raw +mkdir -p ${DATAPATH}/processed + +for DS in electricity traffic +do + DS_PATH=${DATAPATH}/raw/${DS} + ZIP_FNAME=${DS_PATH}.zip + if [ ! -d ${DS_PATH} ] + then + wget "${URLS[${DS}]}" -O ${ZIP_FNAME} + unzip ${ZIP_FNAME} -d ${DS_PATH} + fi + python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" + python -c "from data_utils import preprocess; \ + from configuration import ${DS^}Config as Config; \ + preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" +done + + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh new file mode 100644 index 00000000..86214a9a --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=30} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset electricity \ + --data_path /data/processed/electricity_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh new file mode 100644 index 00000000..cab8e473 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
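+
+# The ': ${VAR:=default}' lines below only set a value when the variable is unset,
+# so defaults can be overridden from the environment, e.g.
+#   NGPU=1 BATCH_SIZE=512 bash scripts/run_traffic_DGX1-16G.sh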
+ +: ${SEED:=1} +: ${LR:=1e-3} +: ${NGPU:=8} +: ${BATCH_SIZE:=1024} +: ${EPOCHS:=20} + +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ + --dataset traffic \ + --data_path /data/processed/traffic_bin \ + --batch_size=${BATCH_SIZE} \ + --sample 450000 50000 \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --seed ${SEED} \ + --use_amp \ + --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, 
sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, 
data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + 
help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
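+
+# PerformanceMeter tracks items/second throughput of the training loop.
+# A minimal usage sketch (assumes `batch_size` items are processed per step):
+#   meter = PerformanceMeter()
+#   for step, batch in enumerate(loader):
+#       meter.reset_current_lap()
+#       ...                       # forward/backward pass
+#       ips = meter.update(batch_size, exclude_from_total=(step == 0))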
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py new file mode 100644 index 00000000..e5ceceeb --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import os +import pickle +import json + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from apex import amp +from apex.optimizers import FusedAdam +#from torch.nn.parallel import DistributedDataParallel as DDP +from apex.parallel import DistributedDataParallel as DDP + +import numpy as np + +import dllogger + +from modeling import TemporalFusionTransformer +from configuration import CONFIGS +from data_utils import TFTBinaryDataset, sample_data +from log_helper import setup_logger +from criterions import QuantileLoss +from inference import predict +from utils import PerformanceMeter +import gpu_affinity +from ema import ModelEma + +def load_dataset(args, config): + train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) + train_split = sample_data(train_split, args.sample_data[0]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) + else: + data_sampler = RandomSampler(train_split) + train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) + + valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) + valid_split = sample_data(valid_split, args.sample_data[1]) + if args.distributed_world_size > 1: + data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) + if 
args.distributed_world_size > 1: + data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) + else: + data_sampler = None + test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) + + print_once(f'Train split length: {len(train_split)}') + print_once(f'Valid split length: {len(valid_split)}') + print_once(f'Test split length: {len(test_split)}') + + return train_loader, valid_loader, test_loader + +def print_once(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +def main(args): + # Enable CuDNN autotuner + nproc_per_node = torch.cuda.device_count() + if args.affinity != 'disabled': + affinity = gpu_affinity.set_affinity( + args.local_rank, + nproc_per_node, + args.affinity + ) + print(f'{args.local_rank}: thread affinity: {affinity}') + + + torch.backends.cudnn.benchmark = True + + ### INIT DISTRIBUTED + if args.distributed_world_size > 1: + args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) + torch.cuda.set_device(args.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') + args.distributed_world_size = int(os.environ['WORLD_SIZE']) + args.distributed_rank = dist.get_rank() + print_once(f'Distributed training with {args.distributed_world_size} GPUs') + torch.cuda.synchronize() + + if args.seed: + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + setup_logger(args) + + config = CONFIGS[args.dataset]() + if args.overwrite_config: + config.__dict__.update(json.loads(args.overwrite_config)) + + dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) + + model = TemporalFusionTransformer(config).cuda() + if args.ema_decay: + model_ema = ModelEma(model, decay=args.ema_decay) + + print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) + criterion = QuantileLoss(config).cuda() + optimizer = FusedAdam(model.parameters(), lr=args.lr) + if args.use_amp: + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") + if args.distributed_world_size > 1: + #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model = DDP(model) + + train_loader, valid_loader, test_loader = load_dataset(args, config) + + global_step = 0 + perf_meter = PerformanceMeter() + + for epoch in range(args.epochs): + start = time.time() + dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) + + model.train() + for local_step, batch in enumerate(train_loader): + perf_meter.reset_current_lap() + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + loss = p_losses.sum() + + if args.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: + if args.clip_grad: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + optimizer.zero_grad() + if args.ema_decay: + model_ema.update(model) + + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses /= args.distributed_world_size + loss = p_losses.sum() + + torch.cuda.synchronize() + ips = 
perf_meter.update(args.batch_size * args.distributed_world_size, + exclude_from_total=local_step in [0, len(train_loader)-1]) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + global_step += 1 + + validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) + + if validate.early_stop_c >= args.early_stopping: + print_once('Early stopping') + break + + ### TEST PHASE ### + state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') + if isinstance(model, DDP): + model.module.load_state_dict(state_dict['model']) + else: + model.load_state_dict(state_dict['model']) + model.cuda().eval() + + tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) + cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) + + unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) + losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) + normalizer = unscaled_targets.abs().mean() + quantiles = 2 * losses / normalizer + + if args.distributed_world_size > 1: + quantiles = quantiles.cuda() + dist.all_reduce(quantiles) + quantiles /= args.distributed_world_size + + quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} + finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} + dllogger.log(step=(), data=finish_log, verbosity=1) + +def validate(args, config, model, criterion, dataloader, global_step): + if not hasattr(validate, 'best_valid_loss'): + validate.best_valid_loss = float('inf') + if not hasattr(validate, 'early_stop_c'): + validate.early_stop_c = 0 + model.eval() + + losses = [] + validation_start = time.time() + for batch in dataloader: + with torch.no_grad(): + batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} + predictions = model(batch) + targets = batch['target'][:,config.encoder_length:,:] + p_losses = criterion(predictions, targets) + bs = next(t for t in batch.values() if t is not None).shape[0] + losses.append((p_losses, bs)) + + validation_end = time.time() + + p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) #takes into accunt that the last batch is not full + if args.distributed_world_size > 1: + dist.all_reduce(p_losses) + p_losses = p_losses/args.distributed_world_size + + ips = len(dataloader.dataset) / (validation_end - validation_start) + + log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} + + if log_dict['loss'] < validate.best_valid_loss: + validate.best_valid_loss = log_dict['loss'] + validate.early_stop_c = 0 + validate.conv_step = global_step + if not dist.is_initialized() or dist.get_rank() == 0: + state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() + ckpt = {'args':args, 'config':config, 'model':state_dict} + torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) + if args.distributed_world_size > 1: + dist.barrier() + else: + validate.early_stop_c += 1 + + log_dict = {'val_'+k:v for k,v in log_dict.items()} + dllogger.log(step=global_step, data=log_dict, verbosity=1) + + +if __name__ == '__main__': + parser = 
argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, required=True, + help='Path to the dataset') + parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), + help='Dataset name') + parser.add_argument('--epochs', type=int, default=25, + help='Default number of training epochs') + parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], + help="""Subsample the dataset. Specify number of training and valid examples. + Values can be provided in scientific notation. Floats will be truncated.""") + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') + parser.add_argument('--clip_grad', type=float, default=0.0) + parser.add_argument('--grad_accumulation', type=int, default=0) + parser.add_argument('--early_stopping', type=int, default=1000, + help='Stop training if validation loss does not improve for more than this number of epochs.') + parser.add_argument('--results', type=str, default='/results', + help='Directory in which results are stored') + parser.add_argument('--log_file', type=str, default='dllogger.json', + help='Name of dllogger output file') + parser.add_argument('--distributed_world_size', type=int, metavar='N', + default=torch.cuda.device_count(), + help='total number of GPUs across all nodes (default: all visible GPUs)') + parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, + help='rank of the current worker') + parser.add_argument('--local_rank', default=0, type=int, + help='rank of the current worker') + parser.add_argument('--overwrite_config', type=str, default='', + help='JSON string used to overload config') + parser.add_argument('--affinity', type=str, + default='socket_unique_interleaved', + choices=['socket', 'single', 'single_unique', + 'socket_unique_interleaved', + 'socket_unique_continuous', + 'disabled'], + help='type of CPU affinity') + parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') + + + ARGS = parser.parse_args() + main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py new file mode 100644 index 00000000..bf88be40 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
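+
+# Note: PerformanceMeter.p(i) returns (approximately) the i-th percentile of
+# the recorded per-step times, e.g. the hypothetical call meter.p(90) gives
+# the 90th-percentile step duration in seconds.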
+ +import time + +class PerformanceMeter(): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.count = 0 + self.total_time = 0 + self.last_update_time = time.time() + self.intervals = [] + + def update(self, n, exclude_from_total=False): + delta = time.time() - self.last_update_time + self.intervals.append(delta) + if not exclude_from_total: + self.total_time += delta + self.count += n + self.avg = self.count / self.total_time + self.last_update_time = time.time() + + return n/delta + + def reset_current_lap(self): + self.last_update_time = time.time() + + def p(self, i): + assert i <= 100 + idx = int(len(self.intervals) * i / 100) + return sorted(self.intervals)[idx] + diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/trivial_model.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/trivial_model.py new file mode 100755 index 00000000..27429d61 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/trivial_model.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.nn as nn + + +class TrivialModel(nn.Module): + def __init__(self, config): + super().__init__() + self.device = torch.device(config.device.get("name", "cpu")) + self.bias = nn.Parameter(torch.ones(1)) + self.encoder_length = config.dataset.encoder_length + self.example_length = config.dataset.example_length + self.predict_steps = self.example_length - self.encoder_length + self.output_dim = len(config.model.get("quantiles", [""])) + + def test_with_last(self, batch): + bs = max([tensor.shape[0] if tensor is not None else 0 for tensor in batch.values()]) + values = ( + # TODO: this will become disfuntional after removing "targer_masked" from dataset. Seed comment in data_utils.py + batch["target_masked"] + .clone()[:, -1, :] + .reshape((bs, 1, self.output_dim)) + ) + + return torch.cat((self.example_length - self.encoder_length) * [values], dim=1) + + def forward(self, batch): + bs = max([tensor.shape[0] if tensor is not None else 0 for tensor in batch.values()]) + return ( + torch.ones([bs, self.example_length - self.encoder_length, self.output_dim]).to(device=self.device) + self.bias + ) + + def test_with_previous(self, batch): + targets = batch["target"].clone() + prev_predictions = targets.roll(1, 1) + return prev_predictions[:, -self.predict_steps :, :] + + def test_with_previous_window(self, batch): + targets = batch["target"].clone() + prev_predictions = targets.roll(self.predict_steps, 1) + return prev_predictions[:, -self.predict_steps :, :] diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/optimizers.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/optimizers.py new file mode 100755 index 00000000..58ee0374 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/optimizers.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +import torch.optim as opt + + +def optimizer_wrapped(config, params): + optimizer_dict = { + "Adadelta": {"var": ["lr", "rho", "eps", "weight_decay"], "func": opt.Adadelta}, + "Adagrad": {"var": ["lr", "lr_decay", "weight_decay", "eps"], "func": opt.Adagrad}, + "Adam": {"var": ["lr", "betas", "eps", "weight_decay", "amsgrad"], "func": opt.Adam}, + "AdamW": {"var": ["lr", "betas", "eps", "weight_decay", "amsgrad"], "func": opt.AdamW}, + "SparseAdam": {"var": ["lr", "betas", "eps"], "func": opt.SparseAdam}, + "Adamax": {"var": ["lr", "betas", "eps", "weight_decay"], "func": opt.Adamax}, + "ASGD": {"var": ["lr", "lambd", "alpha", "t0", "weight_decay"], "func": opt.ASGD}, + "LBFGS": { + 
"var": ["lr", "max_iter", "max_eval", "tolerance_grad", "tolerance_change", "history_size", "line_search_fn"], + "func": opt.LBFGS, + }, + "RMSprop": {"var": ["lr", "alpha", "eps", "weight_decay", "momentum", "centered"], "func": opt.RMSprop}, + "Rprop": {"var": ["lr", "etas", "step_sizes"], "func": opt.Rprop}, + "SGD": {"var": ["lr", "momentum", "weight_decay", "dampening", "nesterov"], "func": opt.SGD}, + } + kwargs = {key: config.optimizer.get(key) for key in optimizer_dict[config.optimizer.name]["var"]} + return optimizer_dict[config.optimizer.name]["func"](params, **kwargs) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/requirements.txt new file mode 100755 index 00000000..e48de501 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/requirements.txt @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +pmdarima==1.8.0 +matplotlib==3.3.2 +wget==3.2 +hydra-core==1.0.6 +pyunpack==0.2.2 +py7zr==0.15.0 +patool==1.12 +tensorboard +optuna +pandas==1.1.4 +dgl-cu111 +tables diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/scripts/setup.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/scripts/setup.sh new file mode 100644 index 00000000..4665a086 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/scripts/setup.sh @@ -0,0 +1 @@ +sudo cp -r ../../../PyTorch/Forecasting/TFT ./models/tft_pyt diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/training/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/ema.py new file mode 100755 index 00000000..73cafc18 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/ema.py @@ -0,0 +1,77 @@ +# Copyright 2021 NVIDIA CORPORATION + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 Ross Wightman + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Exponential Moving Average (EMA) of model updates +""" + +import logging +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +_logger = logging.getLogger(__name__) + + +class ModelEmaV2(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). 
+ + """ + + def __init__(self, config, model, device=None): + super(ModelEmaV2, self).__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = config.trainer.ema.get("decay", 0.999) + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def update(self, model): + update_fn = lambda ema_v, model_v: self.decay * ema_v + (1.0 - self.decay) * model_v + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def set(self, model): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(model_v) + + def forward(self, x): + return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/training/trainer.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/trainer.py new file mode 100755 index 00000000..6549b872 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/trainer.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
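+
+# Trainers used by the Time Series Prediction Platform:
+#  * CTLTrainer  - neural models: epoch-based train/validate loop with
+#                  callbacks, optional DDP, Apex AMP and EMA weights.
+#  * StatTrainer - classical/statistical models that are re-fit per window
+#                  and scored with the same MetricEvaluator.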
+ +import os +from abc import ABC +from functools import partial + +import dgl +import dllogger +import hydra +import numpy as np +import torch +import torch.nn as nn +from apex import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader, DistributedSampler +from torch.utils.data.dataloader import default_collate + +from callbacks.ctl_callbacks import CTLCallbackContainer +from data.data_utils import TSBaseDataset, sample_data +from distributed_utils import ( + get_device, + init_distributed, + is_main_process, + log, + reduce_tensor, +) +from evaluators.evaluation_metrics import MetricEvaluator +from loggers.log_helper import setup_logger +from training.ema import ModelEmaV2 +from training.utils import ( + maybe_restore_checkpoint, + round_dict, + save_checkpoint, + to_device, +) + + +class Trainer(ABC): + def train(self): + return + + def evaluate(self): + return + + +class CTLTrainer(Trainer): + def __init__( + self, + model: nn.Module, + train_dataset: TSBaseDataset, + valid_dataset: TSBaseDataset, + test_dataset: TSBaseDataset, + optimizer, + evaluator: MetricEvaluator, + criterion, + config, + ): + self.config = config + + self._stop_training = False + + self.metrics = {} + + callbacks = [hydra.utils.call(callback_config) for callback_config in self.config.trainer.callback.values()] + self.callbacks = CTLCallbackContainer(self, callbacks) + + self.world_size = self.config.device.get("world_size", 1) + train_dataset = sample_data(train_dataset, self.config.dataset.get("train_samples", -1)) + valid_dataset = sample_data(valid_dataset, self.config.dataset.get("valid_samples", -1)) + self.valid_dataset_len = len(valid_dataset) + self.train_dataset_len = len(train_dataset) + self.train_sampler = None + self.valid_sampler = None + if self.world_size > 1: + local_rank = int(self.config.device.get("local_rank", os.environ.get("LOCAL_RANK", 0))) + self.device = get_device(local_rank, self.config.device.get("name", "cpu")) + self.is_distributed = init_distributed( + int(self.config.device.get("world_size", os.environ.get("WORLD_SIZE", 1))) + ) + torch.cuda.synchronize() + self.train_sampler = DistributedSampler( + train_dataset, config.device.world_size, seed=config.trainer.get("seed", 0), drop_last=True + ) + self.valid_sampler = DistributedSampler( + valid_dataset, config.device.world_size, seed=config.trainer.get("seed", 0), drop_last=False + ) + elif self.config.device.get("local_rank", None): + self.device = get_device(self.config.device.get("local_rank"), self.config.device.get("name", "cpu")) + else: + self.device = torch.device(self.config.device.get("name", "cpu")) + self.logger = setup_logger(self.config) + self.optimizer = optimizer + self.amp_enabled = self.config.trainer.get("AMP", False) + self.model = model.to(self.device) + + if config.trainer.get("ema", None) is not None: + self.ema = ModelEmaV2(config, model, self.device) + else: + self.ema = None + if self.amp_enabled: + self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O2", loss_scale="dynamic") + if self.world_size > 1: + self.model = DDP(self.model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) + + # TODO: line below has to go somewhere else. Or use default print. Logging module alters std streams which prevents us from + # capturing their outputs. + # log(config.pretty()) + + # XXX: Not sure about this. Maybe this should be isolated in collate_fn inside a DataLoader. 
Or maybe we should abstract it away in data_utils? + # For sure we have to rename this. This suggests that masked target is somehow different from + # regular target. + self.train_target = "target_masked" if config.model.get("train_target_mask", True) else "target" + self.eval_target = "target_masked" if config.model.get("eval_target_mask", True) else "target" + self.test_target = "target_masked" if config.model.get("test_target_mask", True) else "target" + + if self.config.dataset.get("graph", False) and self.config.model.get("graph_eligible", False): + + def _collate_graph(samples, target): + batch = dgl.batch(samples) + labels = batch.ndata["target"] + # XXX: we need discuss how to do this neatly + if target == "target_masked": + labels = labels[:, self.config.dataset.encoder_length :, :] + + return batch, labels + + _collate = _collate_graph + else: + + def _collate_dict(samples, target): + batch = default_collate(samples) + labels = batch["target"] + if target == "target_masked": + labels = labels[:, self.config.dataset.encoder_length :, :] + return batch, labels + + _collate = _collate_dict + + self.train_dataloader = DataLoader( + train_dataset, + batch_size=self.config.trainer.batch_size, + num_workers=self.config.trainer.num_workers, + sampler=self.train_sampler, + shuffle=True if self.train_sampler is None else False, + pin_memory=True, + collate_fn=partial(_collate, target=self.train_target), + ) + self.valid_dataloader = DataLoader( + valid_dataset, + batch_size=self.config.trainer.batch_size, + num_workers=self.config.trainer.num_workers, + sampler=self.valid_sampler, + shuffle=True if self.valid_sampler is None else False, + pin_memory=True, + collate_fn=partial(_collate, target=self.eval_target), + ) + self.test_dataloader = DataLoader( + test_dataset, + batch_size=self.config.trainer.batch_size, + num_workers=1, + pin_memory=True, + collate_fn=partial(_collate, target=self.test_target), + ) + if self.config.get("scheduler", None): + self.scheduler = hydra.utils.instantiate(self.config.scheduler, optimizer) + else: + self.scheduler = None + + self.evaluator = evaluator + self.criterion = criterion + + self.log_path = self.config.get("log_path", os.getcwd()) + self.global_step = 0 + self.epoch = 0 + + self.preds_train_output_selector = config.model.get("preds_train_output_selector", -1) + self.preds_eval_output_selector = config.model.get("preds_eval_output_selector", -1) + self.preds_test_output_selector = config.model.get("preds_test_output_selector", -1) + + model_ref = self.model.module if self.world_size > 1 else self.model + test_method_name = config.model.get("test_method", "__call__") + self.test_method = getattr(model_ref, test_method_name) + + checkpoint_path = config.trainer.get("checkpoint_path", None) + maybe_restore_checkpoint(self, checkpoint_path) + + def assess_valid(self): + self.model.eval() + with torch.no_grad(): + running_losses = 0 + + for i, (batch, labels) in enumerate(self.valid_dataloader): + batch = to_device(batch, device=self.device) + labels = to_device(labels, device=self.device) + if self.ema: + preds = self.ema.module(batch) + else: + preds = self.model(batch) + if self.preds_eval_output_selector >= 0: + preds = preds[..., self.preds_eval_output_selector : self.preds_eval_output_selector + 1] + + losses = self.criterion(preds, labels) + losses = reduce_tensor(losses, self.world_size).detach() + running_losses += losses + + running_losses = running_losses / (len(self.valid_dataloader.dataset) / self.config.trainer.batch_size) + if 
len(running_losses.size()) < 1: + running_losses = running_losses.unsqueeze(0) + running_losses = [loss.item() for loss in running_losses] + data = {"val_loss": sum(running_losses)} + for i, elem in enumerate(running_losses): + data["val_loss_component_" + str(i)] = elem + self.logger.log(step=self.global_step, data=data, verbosity=dllogger.Verbosity.VERBOSE) + + self.model.train() + return sum(running_losses) + + def train(self): + + self.callbacks.on_train_begin() + self.global_step = 0 + for epoch in range(self.epoch, self.config.trainer.num_epochs): + self.callbacks.on_epoch_begin(epoch) + + self.logger.log(step=self.global_step, data={"epoch": epoch}, verbosity=dllogger.Verbosity.VERBOSE) + + for i, (batch, labels) in enumerate(self.train_dataloader): + self.callbacks.on_batch_begin(i) + + self.optimizer.zero_grad() + batch = to_device(batch, device=self.device) + labels = to_device(labels, device=self.device) + + preds = self.model(batch) + if self.preds_train_output_selector >= 0: + preds = preds[..., self.preds_train_output_selector : self.preds_train_output_selector + 1] + + losses = self.criterion(preds, labels) + loss = losses.sum() + + if self.amp_enabled: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + self.optimizer.step() + + losses = reduce_tensor(losses, self.world_size, average=True) + if len(losses.size()) < 1: + losses = [losses] + losses = [loss.item() for loss in losses] + data = {"loss": loss.item()} + for k, v in enumerate(losses): + data["loss_component_" + str(k)] = v + + self.logger.log(step=self.global_step, data=data, verbosity=dllogger.Verbosity.VERBOSE) + + if self.config.optimizer.get("gradient_norm", 0.0) > 0: + nn.utils.clip_grad_norm(self.model.parameters(), self.config.optimizer.gradient_norm) + # XXX: shouldn't we move logging to a callback? + if self.global_step % self.config.trainer.log_interval == 0: + self.logger.flush() + self.global_step += 1 + self.callbacks.on_batch_end(i, logs=data) + if self.ema: + self.ema.update(self.model) + if self.scheduler: + self.scheduler.step() + self.callbacks.on_valid_begin(epoch) + validation_loss = self.assess_valid() + data = {"val_loss": validation_loss} + self.callbacks.on_valid_end(epoch, logs=data) + + if is_main_process(): + save_checkpoint(self, checkpoint_dir=self.log_path) + + if self.train_sampler: + self.train_sampler.set_epoch(epoch) + self.valid_sampler.set_epoch(epoch) + + self.callbacks.on_epoch_end(epoch, logs=data) + + if self._stop_training: + break + + self.callbacks.on_train_end(logs=self.metrics) + + def evaluate(self): + self.callbacks.on_evaluate_begin() + maybe_restore_checkpoint(self, os.path.join(self.log_path, "best_checkpoint.pth.tar")) + self.model.eval() + + with torch.no_grad(): + + preds_full = [] + labels_full = [] + weights_full = [] + ids_full = [] + + for i, (batch, labels) in enumerate(self.test_dataloader): + batch = to_device(batch, device=self.device) + labels = to_device(labels, device=self.device) + + if self.config.evaluator.get("use_weights", False): + weights = batch["weight"] + else: + weights = None + + # XXX we should abstract this away + ids = batch.ndata["id"] if isinstance(batch, dgl.DGLGraph) else batch["id"] + ids = ids[ + :, 0, ... + ] # Assumes that time dimension is at index 1. 
We don't check whether te examle is constructed correctly + + labels_full.append(labels) + weights_full.append(weights) + preds = self.test_method(batch) + if self.preds_test_output_selector >= 0: + preds = preds[..., self.preds_test_output_selector : self.preds_test_output_selector + 1] + ids_full.append(ids) + preds_full.append(preds) + + preds_full = torch.cat(preds_full, dim=0).cpu().numpy() + labels_full = torch.cat(labels_full, dim=0).cpu().numpy() + + if self.config.evaluator.get("use_weights", False): + weights_full = torch.cat(weights_full).cpu().numpy() + else: + weights_full = np.zeros((0, 0)) + ids_full = torch.cat(ids_full).cpu().numpy() + eval_metrics = self.evaluator(labels_full, preds_full, weights_full, ids_full) + + self.metrics.update(eval_metrics) + + self.logger.log( + step=[], data={k: float(v) for k, v in self.metrics.items()}, verbosity=dllogger.Verbosity.VERBOSE + ) + self.callbacks.on_evaluate_end(logs=round_dict(self.metrics, decimal=3)) + return round_dict(self.metrics, decimal=3) + + +class StatTrainer(Trainer): + def __init__(self, dataset, evaluator: MetricEvaluator, config, model): + self.config = config + self.evaluator = evaluator + self.dataloader = dataset + self.global_step = 0 + self.epoch = 0 + self.model = model + setup_logger(self.config) + + def evaluate(self): + + preds_full = [] + labels_full = [] + weights_full = [] + ids_full = [] + + for train, test in self.dataloader: + + labels = test["endog"] + if self.config.evaluator.get("use_weights", False): + weights = test["weight"] + else: + weights = None + ids = test["id"].iloc[0] + self.model.fit(train["endog"], train["exog"]) + preds = self.model.predict(test["exog"]) + labels_full.append(labels) + weights_full.append(weights) + + ids_full.append(ids) + preds_full.append(preds) + + preds_full = np.stack(preds_full) + labels_full = np.stack(labels_full) + + if self.config.evaluator.get("use_weights", False): + weights_full = np.stack(weights_full) + else: + weights_full = np.zeros((0, 0)) + ids_full = np.stack(ids_full) + metrics = self.evaluator(labels_full, preds_full, weights_full, ids_full) + + return metrics + + +def numpy_normalised_quantile_loss(y, y_pred, quantile): + prediction_underflow = y - y_pred + losses = [] + + weighted_errors = quantile * np.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * np.maximum( + -prediction_underflow, 0.0 + ) + losses.append(weighted_errors.mean()) + + normalizer = abs(y).mean() + + losses = 2 * losses / normalizer + + return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/training/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/utils.py new file mode 100755 index 00000000..618bc57e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/training/utils.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +import logging +import os + +import dgl +import dllogger +import numpy as np +import torch + + +def save_checkpoint(trainer, filename="checkpoint.pth.tar", checkpoint_dir="./"): + if trainer.ema: + module_to_save = trainer.ema.module if trainer.world_size == 1 else trainer.ema + else: + module_to_save = trainer.model + state = { + "epoch": trainer.epoch + 1, + "global_step": trainer.global_step, + "model_state_dict": module_to_save.state_dict(), + "optimizer_state_dict": trainer.optimizer.state_dict(), + } + checkpoint_path = os.path.join(checkpoint_dir, filename) + trainer.logger.log(step=[], data={"String": f"Saving checkpoint to {filename}"}, verbosity=dllogger.Verbosity.DEFAULT) + torch.save(state, 
checkpoint_path) + + +def maybe_restore_checkpoint(trainer, checkpoint_path): + if checkpoint_path and os.path.isfile(checkpoint_path): + trainer.logger.log( + step=[], data={"String": f"Restoring checkpoint from {checkpoint_path}"}, verbosity=dllogger.Verbosity.DEFAULT + ) + checkpoint = torch.load(checkpoint_path, map_location=trainer.device) + trainer.model.load_state_dict(checkpoint["model_state_dict"]) + trainer.optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + trainer.global_step = checkpoint["global_step"] + trainer.epoch = checkpoint["epoch"] + + +def round_dict(input_data, decimal=2): + rounded_data = { + key: (np.around(value, decimal) if isinstance(value, (np.floating, float)) else value) + for key, value in input_data.items() + } + return rounded_data + + +def to_device(batch, device=None): + if isinstance(batch, torch.Tensor): + return batch.to(device=device) + if isinstance(batch, dict): + return {k: t.to(device=device) if t.numel() else None for k, t in batch.items()} + elif isinstance(batch, dgl.DGLGraph): + return batch.to(device=device) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/calculate_metrics.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/calculate_metrics.py new file mode 100755 index 00000000..10dee0a6 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/calculate_metrics.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Using `calculate_metrics.py` script, you can obtain model accuracy/error metrics using defined `MetricsCalculator` class. + +Data provided to `MetricsCalculator` are obtained from dump files +stored in directory pointed by `--dump-dir` argument. +Above files are prepared by `run_inference_on_fw.py` and `run_inference_on_triton.py` scripts. + +Output data is stored in csv file pointed by `--csv` argument. 
+ +Example call: + +```shell script +python ./triton/calculate_metrics.py \ + --dump-dir /results/dump_triton \ + --csv /results/accuracy_results.csv \ + --metrics metrics.py \ + --metric-class-param1 value +``` +""" + +import argparse +import csv +import logging +import string +from pathlib import Path + +# method from PEP-366 to support relative import in executed modules + +if __package__ is None: + __package__ = Path(__file__).parent.name + +from .deployment_toolkit.args import ArgParserGenerator +from .deployment_toolkit.core import BaseMetricsCalculator, load_from_file +from .deployment_toolkit.dump import JsonDumpReader + +LOGGER = logging.getLogger("calculate_metrics") +TOTAL_COLUMN_NAME = "_total_" + + +def main(): + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser(description="Run models with given dataloader", allow_abbrev=False) + parser.add_argument("--metrics", help="Path to python module containing metrics calculator", required=True) + parser.add_argument("--csv", help="Path to csv file", required=True) + parser.add_argument("--dump-dir", help="Path to directory with dumped outputs (and labels)", required=True) + + args, *_ = parser.parse_known_args() + + MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator") + ArgParserGenerator(MetricsCalculator).update_argparser(parser) + + args = parser.parse_args() + + LOGGER.info("args:") + for key, value in vars(args).items(): + LOGGER.info(f" {key} = {value}") + + MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator") + metrics_calculator: BaseMetricsCalculator = ArgParserGenerator(MetricsCalculator).from_args(args) + + reader = JsonDumpReader(args.dump_dir) + for ids, x, y_true, y_pred in reader.iterate_over(["ids", "inputs", "labels", "outputs"]): + ids = list(ids["ids"]) if ids is not None else None + metrics_calculator.update(ids=ids, x=x, y_pred=y_pred, y_real=y_true) + metrics = metrics_calculator.metrics + + metric_names_with_space = [name for name in metrics if any([c in string.whitespace for c in name])] + if metric_names_with_space: + raise ValueError(f"Metric names shall have no spaces; Incorrect names: {', '.join(metric_names_with_space)}") + LOGGER.info("Results:") + for key, value in metrics.items(): + LOGGER.info(f" {key}: {value}") + csv_path = Path(args.csv) + csv_path.parent.mkdir(parents=True, exist_ok=True) + with csv_path.open("w") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=list(metrics.keys())) + writer.writeheader() + writer.writerow(metrics) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/check_accuracy.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/check_accuracy.py new file mode 100644 index 00000000..4d1e3f84 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/check_accuracy.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
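+# Compares predictions of the native, exported and converted model variants on
+# a single batch from the dataloader and asserts that the per-output L2 norm of
+# the difference stays below 1e-3.
+# Illustrative invocation (paths and type names below are placeholders):
+#   python ./triton/check_accuracy.py \
+#       --native-model <model_dir> --native-type pyt \
+#       --export-model model.onnx --export-type onnx \
+#       --convert-model model.plan --convert-type trt \
+#       --dataloader triton/dataloader.py --model-dir <model_dir> --batch-size 64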
+import argparse +import logging +import os +from pathlib import Path +from tqdm import tqdm +import numpy as np +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1" + +# method from PEP-366 to support relative import in executed modules +if __name__ == "__main__" and __package__ is None: + __package__ = Path(__file__).parent.name + +from .deployment_toolkit.args import ArgParserGenerator # noqa: E402 module level import not at top of file +from .deployment_toolkit.core import ( # noqa: E402 module level import not at top of file + DATALOADER_FN_NAME, + BaseLoader, + BaseRunner, + Model, + load_from_file, +) +from .deployment_toolkit.extensions import loaders, runners # noqa: E402 module level import not at top of file +from .model import get_model + +LOGGER = logging.getLogger("check_accuracy") + +def _get_args(): + parser = argparse.ArgumentParser( + description="Script for checking accuracy of export and conversion.", allow_abbrev=False + ) + parser.add_argument("--native-model", help="Path to native model", required=True) + parser.add_argument("--native-type", help="Native model type", required=True) + parser.add_argument("--export-model", help="Path to exported model", required=True) + parser.add_argument("--export-type", help="Exported model type", required=True) + parser.add_argument("--convert-model", help="Path to converted model", required=True) + parser.add_argument("--convert-type", help="Converted model type", required=True) + parser.add_argument("--dataloader", help="Path to python module containing data loader", required=True) + parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False) + parser.add_argument( + "--ignore-unknown-parameters", + help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)", + action="store_true", + default=False, + ) + + args, unparsed_args = parser.parse_known_args() + + Loader: BaseLoader = loaders.get(args.native_type) + ArgParserGenerator(Loader, module_path=args.native_model).update_argparser(parser) + Runner: BaseRunner = runners.get(args.native_type) + ArgParserGenerator(Runner).update_argparser(parser) + + Loader: BaseLoader = loaders.get(args.export_type) + ArgParserGenerator(Loader, module_path=args.export_model).update_argparser(parser) + Runner: BaseRunner = runners.get(args.export_type) + ArgParserGenerator(Runner).update_argparser(parser) + + Loader: BaseLoader = loaders.get(args.convert_type) + ArgParserGenerator(Loader, module_path=args.convert_model).update_argparser(parser) + Runner: BaseRunner = runners.get(args.convert_type) + ArgParserGenerator(Runner).update_argparser(parser) + + + if args.dataloader is not None: + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + ArgParserGenerator(get_dataloader_fn).update_argparser(parser) + + if args.ignore_unknown_parameters: + args, unknown_args = parser.parse_known_args() + LOGGER.warning(f"Got additional args {unknown_args}") + else: + args = parser.parse_args() + return args + + +def main(): + args = _get_args() + + log_level = logging.INFO if not args.verbose else logging.DEBUG + log_format = "%(asctime)s %(levelname)s %(name)s %(message)s" + logging.basicConfig(level=log_level, format=log_format) + + LOGGER.info("args:") + for key, value in vars(args).items(): + LOGGER.info(f" {key} = {value}") + + LOGGER.info(f"Loading {args.native_model}") + Runner: BaseRunner = runners.get(args.native_type) + + runner_native = 
ArgParserGenerator(Runner).from_args(args) + model_native, _ = get_model(model_dir= args.native_model) + model_native = Model(handle=model_native, precision=None, inputs=None, outputs=['target__0']) + + + + LOGGER.info(f"Loading {args.export_model}") + Loader: BaseLoader = loaders.get(args.export_type) + Runner: BaseRunner = runners.get(args.export_type) + + loader = ArgParserGenerator(Loader, module_path=args.export_model).from_args(args) + runner_export = ArgParserGenerator(Runner).from_args(args) + model_export = loader.load(args.export_model) + + if args.convert_type != 'trt': + LOGGER.info(f"Loading {args.convert_model}") + Loader: BaseLoader = loaders.get(args.convert_type) + Runner: BaseRunner = runners.get(args.convert_type) + + loader = ArgParserGenerator(Loader, module_path=args.convert_model).from_args(args) + runner_convert = ArgParserGenerator(Runner).from_args(args) + model_convert = loader.load(args.convert_model) + + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args) + + ids, x, y_real = next(dataloader_fn()) + with runner_native.init_inference(model=model_native) as runner_session: + y_pred_native = runner_session(x) + del model_native + del runner_native + with runner_export.init_inference(model=model_export) as runner_session: + y_pred_export = runner_session(x) + del model_export + del runner_export + e1 = [np.linalg.norm(y_pred_native[k]-y_pred_export[k]) for k in y_pred_native.keys()] + assert all([i < 1e-3 for i in e1]), "Error between native and export is {}, limit is 1e-3".format(e1) + if args.convert_type != 'trt': + with runner_convert.init_inference(model=model_convert) as runner_session: + y_pred_convert = runner_session(x) + e2 = [np.linalg.norm(y_pred_convert[k]-y_pred_export[k]) for k in y_pred_native.keys()] + assert all([i < 1e-3 for i in e2]), "Error between export and convert is {}, limit is 1e-3".format(e2) + + + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/dataloader.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/dataloader.py new file mode 100644 index 00000000..8d34bc89 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/dataloader.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
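+
+# Triton-side dataloader: reads the merged Hydra config from
+# <model-dir>/.hydra/config_merged.yaml, instantiates only the test split and
+# yields (ids, x, y_real) batches whose keys follow the exported input naming
+# scheme ('s_cat__0' ... 'id__9'); empty tensors are replaced with dummy ones
+# so every declared input is always present.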
+ +import os +import torch +import torch.nn as nn +import hydra +from torch.utils.data import DataLoader +import numpy as np +from omegaconf import OmegaConf + +from torch.utils.data.dataloader import default_collate +from functools import partial +import dgl + +def update_argparser(parser): + parser.add_argument("--model-dir", type=str, help="Path to the model directory you would like to use (likely in outputs)", required=True) + parser.add_argument("--batch-size", type=int, required=True) + + + +def get_dataloader_fn(model_dir, batch_size): + with open(os.path.join(model_dir, ".hydra/config_merged.yaml"), "rb") as f: + config = OmegaConf.load(f) + config._target_ = config.config.dataset._target_ + dataset_dir = config.config.dataset.dest_path + if os.path.isdir(dataset_dir): + train, valid, test = hydra.utils.call(config) + del train + del valid + else: + raise ValueError('dataset_dir must be a directory') + input_names_dict = {'s_cat': 's_cat__0', 's_cont':'s_cont__1', 'k_cat':'k_cat__2', 'k_cont':'k_cont__3', 'o_cat':'o_cat__4', 'o_cont':'o_cont__5', 'target':'target__6', 'weight':'weight__7', 'sample_weight': 'sample_weight__8', 'id':'id__9'} + reshaper = [-1] + [1 for i in range(9)] + test_target = "target_masked" if config.config.model.get("test_target_mask", True) else "target" + if config.config.model.get("quantiles", None): + tile_num = len(config.config.model.quantiles) + else: + tile_num = 1 + if config.config.dataset.get('graph', False) and config.config.model.get('graph_eligible', False): + def _collate_graph(samples, target): + batch = dgl.batch(samples) + labels = batch.ndata['target'] + # XXX: we need discuss how to do this neatly + if target == "target_masked": + labels = labels[:, config.config.dataset.encoder_length:, :] + + return batch, labels + + _collate = _collate_graph + else: + def _collate_dict(samples, target): + batch = default_collate(samples) + labels = batch['target'] + if target == "target_masked": + labels = labels[:,config.config.dataset.encoder_length:, :] + return batch, labels + + _collate = _collate_dict + data_loader = DataLoader(test, batch_size=int(batch_size), num_workers=2, pin_memory=True, collate_fn=partial(_collate, target=test_target)) + def _get_dataloader(): + for step, (batch, labels) in enumerate(data_loader): + bs = batch['target'].shape[0] + x = {input_names_dict[key]: batch[key].numpy() if batch[key].numel() else np.ones([bs]).reshape(reshaper) for key in input_names_dict.keys()} + ids = batch['id'][:,0].numpy() + y_real = {'target__0':np.tile(labels.numpy(), (1, 1, tile_num))} #Probably need to expand the final dims here as well + yield (ids, x, y_real) + + + return _get_dataloader diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/.version b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/.version new file mode 100644 index 00000000..d1eeaed1 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/.version @@ -0,0 +1 @@ +0.6.21 \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/__init__.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/__init__.py new file mode 100644 index 00000000..8ad3be9f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/args.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/args.py new file mode 100644 index 00000000..f6876b80 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/args.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import inspect +import logging +from typing import Callable, Dict, Optional, Union + +from model_navigator.utils.cli import is_dict_generic, is_list_generic, is_optional_generic + +from .core import GET_ARGPARSER_FN_NAME, load_from_file + +LOGGER = logging.getLogger(__name__) + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def filter_fn_args(args: Union[dict, argparse.Namespace], fn: Callable) -> dict: + signature = inspect.signature(fn) + parameters_names = list(signature.parameters) + if isinstance(args, argparse.Namespace): + args = vars(args) + args = {k: v for k, v in args.items() if k in parameters_names} + return args + + +def add_args_for_fn_signature(parser, fn) -> argparse.ArgumentParser: + parser.conflict_handler = "resolve" + signature = inspect.signature(fn) + for parameter in signature.parameters.values(): + if parameter.name in ["self", "args", "kwargs"]: + continue + argument_kwargs = {} + if parameter.annotation != inspect.Parameter.empty: + + is_optional = is_optional_generic(parameter.annotation) + if is_optional: + annotation = parameter.annotation.__args__[0] # Optional[cls] will be changed into Union[cls, None] + else: + annotation = parameter.annotation + + is_list = is_list_generic(annotation) + is_dict = is_dict_generic(annotation) + + if parameter.annotation == bool: + argument_kwargs["type"] = str2bool + argument_kwargs["choices"] = [0, 1] + elif is_list: + argument_kwargs["type"] = annotation.__args__[0] # List[cls] -> cls + elif is_dict: + raise RuntimeError( + f"Could not prepare argument parser for {parameter.name}: {parameter.annotation} in {fn}" + ) + else: + argument_kwargs["type"] = annotation + + if parameter.default != inspect.Parameter.empty: + if parameter.annotation == bool: + argument_kwargs["default"] = 
str2bool(parameter.default) + else: + argument_kwargs["default"] = parameter.default + else: + argument_kwargs["required"] = True + name = parameter.name.replace("_", "-") + LOGGER.debug(f"Adding argument {name} with {argument_kwargs}") + parser.add_argument(f"--{name}", **argument_kwargs) + return parser + + +class ArgParserGenerator: + def __init__(self, cls_or_fn, module_path: Optional[str] = None): + self._cls_or_fn = cls_or_fn + + init_method_name = "__init__" + self._handle = cls_or_fn if inspect.isfunction(cls_or_fn) else getattr(cls_or_fn, init_method_name, None) + input_is_python_file = module_path and module_path.endswith(".py") + self._input_path = module_path if input_is_python_file else None + self._required_fn_name_for_signature_parsing = getattr( + cls_or_fn, "required_fn_name_for_signature_parsing", None + ) + + def update_argparser(self, parser): + name = self._handle.__name__ + group_parser = parser.add_argument_group(name) + add_args_for_fn_signature(group_parser, fn=self._handle) + self._update_argparser(group_parser) + + def get_args(self, args: argparse.Namespace): + filtered_args = filter_fn_args(args, fn=self._handle) + + tmp_parser = argparse.ArgumentParser(allow_abbrev=False) + self._update_argparser(tmp_parser) + custom_names = [ + p.dest.replace("-", "_") for p in tmp_parser._actions if not isinstance(p, argparse._HelpAction) + ] + custom_params = {n: getattr(args, n) for n in custom_names} + filtered_args = {**filtered_args, **custom_params} + return filtered_args + + def from_args(self, args: Union[argparse.Namespace, Dict]): + args = self.get_args(args) + LOGGER.info(f"Initializing {self._cls_or_fn.__name__}({args})") + return self._cls_or_fn(**args) + + def _update_argparser(self, parser): + label = "argparser_update" + if self._input_path: + update_argparser_handle = load_from_file(self._input_path, label=label, target=GET_ARGPARSER_FN_NAME) + if update_argparser_handle: + update_argparser_handle(parser) + elif self._required_fn_name_for_signature_parsing: + fn_handle = load_from_file( + self._input_path, label=label, target=self._required_fn_name_for_signature_parsing + ) + if fn_handle: + add_args_for_fn_signature(parser, fn_handle) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/__init__.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/__init__.py new file mode 100644 index 00000000..8ad3be9f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/onnx.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/onnx.py new file mode 100644 index 00000000..2b93b9f0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/onnx.py @@ -0,0 +1,237 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pathlib import Path +from typing import Dict, Optional, Union + +import numpy as np + +# pytype: disable=import-error +import onnx +import onnx.shape_inference +import onnxruntime +from google.protobuf import text_format +from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE + +from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec +from ..extensions import loaders, runners, savers +from .utils import infer_precision + +# pytype: enable=import-error + + +LOGGER = logging.getLogger(__name__) + + +def _value_info2tensor_spec(value_info: onnx.ValueInfoProto): + onnx_data_type_map = {"float": "float32", "double": "float64"} + + elem_type_name = onnx.TensorProto.DataType.Name(value_info.type.tensor_type.elem_type).lower() + dtype = onnx_data_type_map.get(elem_type_name, elem_type_name) + + def _get_dim(dim): + which = dim.WhichOneof("value") + if which is not None: # which is None when dim is None + dim = getattr(dim, which) + return None if isinstance(dim, (str, bytes)) else dim + + shape = value_info.type.tensor_type.shape + shape = tuple(_get_dim(d) for d in shape.dim) + return TensorSpec(value_info.name, dtype=dtype, shape=shape) + + +def _infer_graph_precision(onnx_graph: onnx.GraphProto) -> Optional[Precision]: + import networkx as nx + + # build directed graph + nx_graph = nx.DiGraph() + + def _get_dtype(vi): + t = vi.type + if hasattr(t, "tensor_type"): + type_id = t.tensor_type.elem_type + else: + raise NotImplementedError("Not implemented yet") + return TENSOR_TYPE_TO_NP_TYPE[type_id] + + node_output2type = {vi.name: _get_dtype(vi) for vi in onnx_graph.value_info} + + node_outputs2node = {output_name: node for node in onnx_graph.node for output_name in node.output} + node_inputs2node = {input_name: node for node in onnx_graph.node for input_name in node.input} + + for node in onnx_graph.node: + node_dtype = node_output2type.get("+".join(node.output), None) + nx_graph.add_node( + node.name, + op=node.op_type, + attr={a.name: a for a in node.attribute}, + dtype=node_dtype, + ) + for input_name in node.input: + prev_node = node_outputs2node.get(input_name, None) + if prev_node: + nx_graph.add_edge(prev_node.name, node.name) + + for input_node in onnx_graph.input: + input_name = input_node.name + nx_graph.add_node(input_name, op="input", dtype=_get_dtype(input_node)) + next_node = node_inputs2node.get(input_name, None) + if next_node: + nx_graph.add_edge(input_name, next_node.name) + + for output in onnx_graph.output: + output_name = 
output.name + nx_graph.add_node(output_name, op="output", dtype=_get_dtype(output)) + prev_node = node_outputs2node.get(output_name, None) + if prev_node: + nx_graph.add_edge(prev_node.name, output_name) + else: + LOGGER.warning(f"Could not find previous node for {output_name}") + + input_names = [n.name for n in onnx_graph.input] + output_names = [n.name for n in onnx_graph.output] + most_common_dtype = infer_precision(nx_graph, input_names, output_names, lambda node: node.get("dtype", None)) + if most_common_dtype is not None: + precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype] + else: + precision = None + return precision + + +class OnnxLoader(BaseLoader): + def load(self, model_path: Union[str, Path], **_) -> Model: + if isinstance(model_path, Path): + model_path = model_path.as_posix() + + model = onnx.load(model_path) + onnx.checker.check_model(model) + onnx.helper.strip_doc_string(model) + model = onnx.shape_inference.infer_shapes(model) + + # TODO: probably modification of onnx model ios causes error on optimize + # from onnx.utils import polish_model + # model = polish_model(model) # run checker, docs strip, optimizer and shape inference + + inputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.input} + outputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.output} + + precision = _infer_graph_precision(model.graph) + + return Model(model, precision, inputs, outputs) + + +class OnnxSaver(BaseSaver): + def __init__(self, as_text: bool = False): + self._as_text = as_text + + def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> None: + model_path = Path(model_path) + LOGGER.debug(f"Saving ONNX model to {model_path.as_posix()}") + model_path.parent.mkdir(parents=True, exist_ok=True) + + onnx_model: onnx.ModelProto = model.handle + if self._as_text: + with model_path.open("w") as f: + f.write(text_format.MessageToString(onnx_model)) + else: + with model_path.open("wb") as f: + f.write(onnx_model.SerializeToString()) + + +""" +ExecutionProviders on onnxruntime 1.4.0 +['TensorrtExecutionProvider', + 'CUDAExecutionProvider', + 'MIGraphXExecutionProvider', + 'NGRAPHExecutionProvider', + 'OpenVINOExecutionProvider', + 'DnnlExecutionProvider', + 'NupharExecutionProvider', + 'VitisAIExecutionProvider', + 'ArmNNExecutionProvider', + 'ACLExecutionProvider', + 'CPUExecutionProvider'] +""" + + +def _check_providers(providers): + providers = providers or [] + if not isinstance(providers, (list, tuple)): + providers = [providers] + available_providers = onnxruntime.get_available_providers() + unavailable = set(providers) - set(available_providers) + if unavailable: + raise RuntimeError(f"Unavailable providers {unavailable}") + return providers + + +class OnnxRunner(BaseRunner): + def __init__(self, verbose_runtime_logs: bool = False): + self._providers = None + self._verbose_runtime_logs = verbose_runtime_logs + + def init_inference(self, model: Model): + assert isinstance(model.handle, onnx.ModelProto) + return OnnxRunnerSession( + model=model, providers=self._providers, verbose_runtime_logs=self._verbose_runtime_logs + ) + + +class OnnxRunnerSession(BaseRunnerSession): + def __init__(self, model: Model, providers, verbose_runtime_logs: bool = False): + super().__init__(model) + self._input_names = None + self._output_names = None + self._session = None + self._providers = providers + self._verbose_runtime_logs = verbose_runtime_logs + self._old_env_values = {} + + def __enter__(self): + 
self._old_env_values = self._set_env_variables() + sess_options = onnxruntime.SessionOptions() # default session options + if self._verbose_runtime_logs: + sess_options.log_severity_level = 0 + sess_options.log_verbosity_level = 1 + LOGGER.info( + f"Starting inference session for onnx model providers={self._providers} sess_options={sess_options}" + ) + + self._input_names = list(self._model.inputs) + self._output_names = list(self._model.outputs) + + model_payload = self._model.handle.SerializeToString() + self._session = onnxruntime.InferenceSession( + model_payload, providers=self._providers, sess_options=sess_options + ) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._input_names = None + self._output_names = None + self._session = None + self._recover_env_variables(self._old_env_values) + + def __call__(self, x: Dict[str, object]): + feed_dict = {k: x[k] for k in self._input_names} + y_pred = self._session.run(self._output_names, feed_dict) + y_pred = dict(zip(self._output_names, y_pred)) + + return y_pred + + +loaders.register_extension(Format.ONNX.value, OnnxLoader) +runners.register_extension(Format.ONNX.value, OnnxRunner) +savers.register_extension(Format.ONNX.value, OnnxSaver) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/pyt.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/pyt.py new file mode 100644 index 00000000..60456c16 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/pyt.py @@ -0,0 +1,295 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
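+
+# PyTorch back-end of the deployment toolkit: loaders, savers and runners for
+# eager, TorchScript and PyTorch-to-ONNX paths. Models are obtained through a
+# user-provided `get_model` function and their I/O signature is inferred from a
+# single dataloader batch. A rough sketch of the expected `get_model` contract
+# (names below are illustrative, not part of this change):
+#
+#     def get_model(*, model_dir: str):
+#         model = ...  # build the network and load trained weights
+#         io_names = {"inputs": [...], "outputs": [...]}
+#         return model, io_names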
+ +import logging +import typing +from collections import Counter +from pathlib import Path +from typing import Dict, Optional, Union + +import numpy as np +import torch # pytype: disable=import-error +import yaml +from model_navigator.model import ModelSignatureConfig +from model_navigator.tensor import TensorSpec +from model_navigator.utils.config import YamlConfigFile + +from ..core import ( + GET_MODEL_FN_NAME, + BaseLoader, + BaseRunner, + BaseRunnerSession, + BaseSaver, + Format, + Model, + Precision, + load_from_file, +) +from ..extensions import loaders, runners, savers +from .utils import get_dynamic_axes, get_shapes_with_dynamic_axes + +LOGGER = logging.getLogger(__name__) + + +def get_sample_input(dataloader, device): + for batch in dataloader: + _, x, _ = batch + break + if isinstance(x, dict): + sample_input = list(x.values()) + elif isinstance(x, list): + sample_input = x + else: + raise TypeError("The first element (x) of batch returned by dataloader must be a list or a dict") + + for idx, s in enumerate(sample_input): + sample_input[idx] = torch.from_numpy(s).to(device) + + return tuple(sample_input) + + +def get_model_device(torch_model): + if next(torch_model.parameters()).is_cuda: + return "cuda" + else: + return "cpu" + + +def infer_model_precision(model): + counter = Counter() + for param in model.parameters(): + counter[param.dtype] += 1 + if counter[torch.float16] > 0: + return Precision.FP16 + else: + return Precision.FP32 + + +def _get_tensor_dtypes(dataloader, precision): + def _get_dtypes(t): + def _get_dtype(v): + dtype = str(v.dtype) + if dtype == "float64": + dtype = "float32" + if precision == Precision.FP16 and dtype == "float32": + dtype = "float16" + return np.dtype(dtype) + + return {k: _get_dtype(v) for k, v in t.items()} + + batch = next(dataloader) + _, x, y = batch + input_dtypes = _get_dtypes(x) + output_dtypes = _get_dtypes(y) + + return input_dtypes, output_dtypes + + +### TODO assumption: floating point input +### type has same precision as the model +def _get_model_signature( + inputs_names: typing.List[str], + outputs_names: typing.List[str], + precision, + dataloader_fn, + batch_size_dim: typing.Optional[int] = None, +): + dataloader = dataloader_fn() + input_dtypes, output_dtypes = _get_tensor_dtypes(dataloader, precision) + input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim=batch_size_dim) + + inputs = { + name: TensorSpec(name=name, dtype=input_dtypes[name], shape=tuple(input_shapes[name])) for name in inputs_names + } + outputs = { + name: TensorSpec(name=name, dtype=output_dtypes[name], shape=tuple(output_shapes[name])) + for name in outputs_names + } + + return ModelSignatureConfig(inputs, outputs) + + +class PyTorchModelLoader(BaseLoader): + required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME + + def __init__(self, **kwargs): + self._model_args = kwargs + + def load(self, model_path: Union[str, Path], **kwargs) -> Model: + if isinstance(model_path, Path): + model_path = model_path.as_posix() + + get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME) + model, io_names_dict = get_model(**self._model_args) + + dataloader_fn = kwargs.get("dataloader_fn", None) + output_type = kwargs.get("output_type", None) + precision = infer_model_precision(model) + + batch_axis = getattr(model, "bermuda_batch_axis", 0) # by default models supports batching; batch_axis=0 + + model_signature = _get_model_signature( + inputs_names=io_names_dict["inputs"], + 
outputs_names=io_names_dict["outputs"], + precision=precision, + dataloader_fn=dataloader_fn, + batch_size_dim=batch_axis, + ) + + model = Model(handle=model, precision=precision, inputs=model_signature.inputs, outputs=model_signature.outputs) + + if output_type == Format.TS_TRACE.value: + return self._trace(model, dataloader_fn) + elif output_type == Format.TS_SCRIPT.value: + return self._script(model) + elif output_type == Format.ONNX.value: + return model + else: + raise ValueError(f"Not supported PyTorch format: {output_type}") + + def _trace(self, model: Model, dataloader_fn) -> Model: + device = get_model_device(model.handle) + dummy_input = get_sample_input(dataloader_fn(), device) + traced_model = torch.jit.trace_module(model.handle, {"forward": dummy_input}) + return Model(traced_model, precision=model.precision, inputs=model.inputs, outputs=model.outputs) + + def _script(self, model: Model) -> Model: + scripted_model = torch.jit.script(model.handle) + return Model(scripted_model, precision=model.precision, inputs=model.inputs, outputs=model.outputs) + + +class TorchScriptLoader(BaseLoader): + def __init__(self, tensor_names_path: str = None, **kwargs): + self._model_args = kwargs + self._io_spec = None + if tensor_names_path is not None: + with Path(tensor_names_path).open("r") as fh: + tensor_infos = yaml.load(fh, Loader=yaml.SafeLoader) + self._io_spec = ModelSignatureConfig(tensor_infos["inputs"], tensor_infos["outputs"]) + + def load(self, model_path: Union[str, Path], **_) -> Model: + if not isinstance(model_path, Path): + model_path = Path(model_path) + model = torch.jit.load(model_path.as_posix()) + precision = infer_model_precision(model) + + io_spec = self._io_spec + if not io_spec: + yaml_path = model_path.parent / f"{model_path.name}.yaml" + if not yaml_path.is_file(): + raise ValueError( + f"If `--tensor-names-path is not provided, " + f"TorchScript model loader expects file {yaml_path} with tensor information." + ) + with yaml_path.open("r") as fh: + tensor_info = yaml.load(fh, Loader=yaml.SafeLoader) + io_spec = ModelSignatureConfig(tensor_info["inputs"], tensor_info["outputs"]) + + return Model(handle=model, precision=precision, inputs=io_spec.inputs, outputs=io_spec.outputs) + + +class PYT2ONNXSaver(BaseSaver): + def __init__(self, onnx_opset: int = None): + self._onnx_opset = onnx_opset + + def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> Model: + if isinstance(model_path, Path): + model_path = model_path.as_posix() + assert isinstance(model.handle, torch.jit.ScriptModule) or isinstance( + model.handle, torch.nn.Module + ), "The model must be of type 'torch.jit.ScriptModule' or 'torch.nn.Module'. Converter aborted." 
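+        # Derive dynamic axes and a sample input from one dataloader batch, then
+        # export the module to ONNX with constant folding and the ONNX checker enabled.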
+ dynamic_axes = get_dynamic_axes(dataloader_fn(), batch_size_dim=0) + + device = get_model_device(model.handle) + dummy_input = get_sample_input(dataloader_fn(), device) + + with torch.no_grad(): + torch.onnx.export( + model.handle, + dummy_input, + model_path, + do_constant_folding=True, + input_names=list(model.inputs), + output_names=list(model.outputs), + dynamic_axes=dynamic_axes, + opset_version=self._onnx_opset, + enable_onnx_checker=True, + ) + + +class TorchScriptSaver(BaseSaver): + def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> None: + if not isinstance(model_path, Path): + model_path = Path(model_path) + if isinstance(model.handle, torch.jit.ScriptModule): + torch.jit.save(model.handle, model_path.as_posix()) + else: + raise RuntimeError("The model must be of type 'torch.jit.ScriptModule'. Saving aborted.") + + signature_config = ModelSignatureConfig(inputs=model.inputs, outputs=model.outputs) + annotation_path = model_path.parent / f"{model_path.name}.yaml" + with YamlConfigFile(annotation_path) as config_file: + config_file.save_config(signature_config) + + +class PyTorchRunner(BaseRunner): + def __init__(self): + pass + + def init_inference(self, model: Model): + return PyTorchRunnerSession(model=model) + + +class PyTorchRunnerSession(BaseRunnerSession): + def __init__(self, model: Model): + super().__init__(model) + + assert isinstance(model.handle, torch.jit.ScriptModule) or isinstance( + model.handle, torch.nn.Module + ), "The model must be of type 'torch.jit.ScriptModule' or 'torch.nn.Module'. Runner aborted." + + self._model = model + self._output_names = None + + def __enter__(self): + self._output_names = list(self._model.outputs) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._output_names = None + self._model = None + + def __call__(self, x: Dict[str, object]): + with torch.no_grad(): + feed_list = [torch.from_numpy(v).cuda() for k, v in x.items()] + y_pred = self._model.handle(*feed_list) + if isinstance(y_pred, torch.Tensor): + y_pred = (y_pred,) + y_pred = [t.cpu().numpy() for t in y_pred] + y_pred = dict(zip(self._output_names, y_pred)) + + return y_pred + + +loaders.register_extension(Format.PYT.value, PyTorchModelLoader) +loaders.register_extension(Format.TS_TRACE.value, TorchScriptLoader) +loaders.register_extension(Format.TS_SCRIPT.value, TorchScriptLoader) + +savers.register_extension(Format.TS_SCRIPT.value, TorchScriptSaver) +savers.register_extension(Format.TS_TRACE.value, TorchScriptSaver) +savers.register_extension(f"{Format.PYT.value}--{Format.ONNX.value}", PYT2ONNXSaver) + +runners.register_extension(Format.PYT.value, PyTorchRunner) +runners.register_extension(Format.TS_SCRIPT.value, PyTorchRunner) +runners.register_extension(Format.TS_TRACE.value, PyTorchRunner) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/tensorrt.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/tensorrt.py new file mode 100644 index 00000000..55b717de --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/tensorrt.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys +from pathlib import Path +from typing import Dict, NamedTuple, Optional, Union + +import numpy as np + +# pytype: disable=import-error +try: + import pycuda.autoinit + import pycuda.driver as cuda +except Exception as e: + logging.getLogger(__name__).warning(f"Problems with importing pycuda package; {e}") +# pytype: enable=import-error + +import tensorrt as trt # pytype: disable=import-error + +from ..core import BaseLoader, BaseRunner, BaseRunnerSession, Format, Model, TensorSpec +from ..extensions import loaders, runners + +LOGGER = logging.getLogger(__name__) +TRT_LOGGER = trt.Logger(trt.Logger.INFO) + +# documentation: +# https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html +# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_samples_section + +_NP_DTYPE2TRT_DTYPE = { + np.dtype("float32"): trt.DataType.FLOAT, + np.dtype("float16"): trt.DataType.HALF, + np.dtype("int8"): trt.DataType.INT8, + np.dtype("int32"): trt.DataType.INT32, + np.dtype("bool"): trt.DataType.BOOL, +} + + +class TensorRTLoader(BaseLoader): + def load(self, model_path: Union[str, Path], **_) -> Model: + model_path = Path(model_path) + LOGGER.debug(f"Loading TensorRT engine from {model_path}") + + with model_path.open("rb") as fh, trt.Runtime(TRT_LOGGER) as runtime: + engine = runtime.deserialize_cuda_engine(fh.read()) + + if engine is None: + raise RuntimeError(f"Could not load ICudaEngine from {model_path}") + + inputs = {} + outputs = {} + for binding_idx in range(engine.num_bindings): + name = engine.get_binding_name(binding_idx) + is_input = engine.binding_is_input(binding_idx) + dtype = np.dtype(trt.nptype(engine.get_binding_dtype(binding_idx))).name + shape = engine.get_binding_shape(binding_idx) + if is_input: + inputs[name] = TensorSpec(name, dtype, shape) + else: + outputs[name] = TensorSpec(name, dtype, shape) + + return Model(engine, None, inputs, outputs) + + +class TRTBuffers(NamedTuple): + x_host: Optional[Dict[str, object]] + x_dev: Dict[str, object] + y_pred_host: Dict[str, object] + y_pred_dev: Dict[str, object] + + +class TensorRTRunner(BaseRunner): + def __init__(self): + pass + + def init_inference(self, model: Model): + return TensorRTRunnerSession(model=model) + + +class TensorRTRunnerSession(BaseRunnerSession): + def __init__(self, model: Model): + super().__init__(model) + assert isinstance(model.handle, trt.ICudaEngine) + self._model = model + self._has_dynamic_shapes = None + + self._context = None + self._engine: trt.ICudaEngine = self._model.handle + self._cuda_context = pycuda.autoinit.context + + self._input_names = None + self._output_names = None + self._buffers = None + + def __enter__(self): + self._context = self._engine.create_execution_context() + self._context.__enter__() + + self._input_names = [ + self._engine[idx] for idx in range(self._engine.num_bindings) if self._engine.binding_is_input(idx) + ] + self._output_names = [ + self._engine[idx] for idx in range(self._engine.num_bindings) if not self._engine.binding_is_input(idx) + ] + # all_binding_shapes_specified is True for models without 
dynamic shapes + # so initially this variable is False for models with dynamic shapes + self._has_dynamic_shapes = not self._context.all_binding_shapes_specified + + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._context.__exit__(exc_type, exc_value, traceback) + self._input_names = None + self._output_names = None + + # TODO: are cuda buffers dealloc automatically? + self._buffers = None + + def __call__(self, x): + buffers = self._prepare_buffers_if_needed(x) + bindings = self._update_bindings(buffers) + + for name in self._input_names: + cuda.memcpy_htod(buffers.x_dev[name], buffers.x_host[name]) + self._cuda_context.push() + self._context.execute_v2(bindings=bindings) + self._cuda_context.pop() + for name in self._output_names: + cuda.memcpy_dtoh(buffers.y_pred_host[name], buffers.y_pred_dev[name]) + + return buffers.y_pred_host + + def _update_bindings(self, buffers: TRTBuffers): + bindings = [None] * self._engine.num_bindings + for name in buffers.y_pred_dev: + binding_idx: int = self._engine[name] + bindings[binding_idx] = buffers.y_pred_dev[name] + + for name in buffers.x_dev: + binding_idx: int = self._engine[name] + bindings[binding_idx] = buffers.x_dev[name] + + return bindings + + def _set_dynamic_input_shapes(self, x_host): + def _is_shape_dynamic(input_shape): + return any([dim is None or dim == -1 for dim in input_shape]) + + for name in self._input_names: + bindings_idx = self._engine[name] + data_shape = x_host[name].shape # pytype: disable=attribute-error + if self._engine.is_shape_binding(bindings_idx): + input_shape = self._context.get_shape(bindings_idx) + if _is_shape_dynamic(input_shape): + self._context.set_shape_input(bindings_idx, data_shape) + else: + input_shape = self._engine.get_binding_shape(bindings_idx) + if _is_shape_dynamic(input_shape): + self._context.set_binding_shape(bindings_idx, data_shape) + + assert self._context.all_binding_shapes_specified and self._context.all_shape_inputs_specified + + def _prepare_buffers_if_needed(self, x_host: Dict[str, object]): + # pytype: disable=attribute-error + new_batch_size = list(x_host.values())[0].shape[0] + current_batch_size = list(self._buffers.y_pred_host.values())[0].shape[0] if self._buffers else 0 + # pytype: enable=attribute-error + + if self._has_dynamic_shapes or new_batch_size != current_batch_size: + # TODO: are CUDA buffers dealloc automatically? 
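+            # Input shapes changed (dynamic shapes or a different batch size):
+            # propagate the actual shapes to the execution context, then allocate
+            # fresh host/device buffers matching the resolved binding shapes.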
+ + self._set_dynamic_input_shapes(x_host) + + y_pred_host = {} + for name in self._output_names: + shape = self._context.get_binding_shape(self._engine[name]) + binding_idx: int = self._engine[name] + dtype_from_trt_binding = np.dtype(trt.nptype(self._engine.get_binding_dtype(binding_idx))) + dtype_from_model_spec = np.dtype(self._model.outputs[name].dtype) + + assert dtype_from_model_spec == dtype_from_trt_binding + + y_pred_host[name] = np.zeros(shape, dtype=dtype_from_model_spec) + + y_pred_dev = {name: cuda.mem_alloc(data.nbytes) for name, data in y_pred_host.items()} + + # cast host input into binding dtype + def _cast_input(name, data): + binding_idx: int = self._engine[name] + np_dtype = trt.nptype(self._engine.get_binding_dtype(binding_idx)) + return data.astype(np_dtype) + + x_host = {name: _cast_input(name, host_input) for name, host_input in x_host.items()} + + x_dev = { + name: cuda.mem_alloc(host_input.nbytes) + for name, host_input in x_host.items() + if name in self._input_names # pytype: disable=attribute-error + } + + self._buffers = TRTBuffers(None, x_dev, y_pred_host, y_pred_dev) + + return self._buffers._replace(x_host=x_host) + + +if "pycuda.driver" in sys.modules: + loaders.register_extension(Format.TRT.value, TensorRTLoader) + runners.register_extension(Format.TRT.value, TensorRTRunner) +else: + LOGGER.warning("Do not register TensorRT extension due problems with importing pycuda.driver package.") diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/utils.py new file mode 100644 index 00000000..686f37a8 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/bermuda/utils.py @@ -0,0 +1,129 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
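+
+# Shape and precision helpers shared by the ONNX/PyTorch back-ends: inferring a
+# graph's dominant floating-point precision, detecting dynamic axes by comparing
+# tensor shapes across dataloader batches, and collecting per-input min/opt/max
+# shape statistics.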
+from collections import Counter +from typing import Callable, Dict, List, Optional + +import networkx as nx + +from ..core import ShapeSpec + + +def infer_precision( + nx_graph: nx.Graph, + input_names: List[str], + output_names: List[str], + get_node_dtype_fn: Callable, +): + node_dtypes = [nx_graph.nodes[node_name].get("dtype", None) for node_name in nx_graph.nodes] + node_dtypes = [dt for dt in node_dtypes if dt is None or dt.kind not in ["i", "b"]] + dtypes_counter = Counter(node_dtypes) + return dtypes_counter.most_common()[0][0] + + +def get_shapes_with_dynamic_axes(dataloader, batch_size_dim: Optional[int] = None): + def _set_dynamic_shapes(t, shapes): + for k, v in t.items(): + shape = list(v.shape) + for dim, s in enumerate(shape): + if shapes[k][dim] != -1 and shapes[k][dim] != s: + shapes[k][dim] = -1 + + def _mark_batch_axis(shape, batch_axis: int): + shape = list(shape) + shape[batch_axis] = -1 + return tuple(shape) + + ## get all shapes from input and output tensors + input_shapes = {} + output_shapes = {} + for batch in dataloader: + _, x, y = batch + for k, v in x.items(): + input_shapes[k] = list(v.shape) + for k, v in y.items(): + output_shapes[k] = list(v.shape) + break + + # based on max iterations, check which + # dimensions differ to determine dynamic_axes + max_num_iters = 100 + for idx, batch in enumerate(dataloader): + if idx >= max_num_iters: + break + + _, x, y = batch + + _set_dynamic_shapes(x, input_shapes) + _set_dynamic_shapes(y, output_shapes) + + if batch_size_dim is not None: + input_shapes = {name: _mark_batch_axis(shape, batch_size_dim) for name, shape in input_shapes.items()} + output_shapes = {name: _mark_batch_axis(shape, batch_size_dim) for name, shape in output_shapes.items()} + + return input_shapes, output_shapes + + +def get_dynamic_axes(dataloader, batch_size_dim: Optional[int] = None): + input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim=batch_size_dim) + all_shapes = {**input_shapes, **output_shapes} + dynamic_axes = {} + + for k, shape in all_shapes.items(): + for idx, s in enumerate(shape): + if s == -1: + dynamic_axes[k] = {idx: k + "_" + str(idx)} + + for k in all_shapes: + if k in dynamic_axes: + dynamic_axes[k].update({batch_size_dim: "batch_size_" + str(batch_size_dim)}) + else: + dynamic_axes[k] = {batch_size_dim: "batch_size_" + str(batch_size_dim)} + + return dynamic_axes + + +def get_input_shapes(dataloader, max_batch_size=1) -> Dict[str, ShapeSpec]: + def init_counters_and_shapes(x, counters, min_shapes, max_shapes): + for k, v in x.items(): + counters[k] = Counter() + min_shapes[k] = [float("inf")] * v.ndim + max_shapes[k] = [float("-inf")] * v.ndim + + counters = {} + min_shapes: Dict[str, tuple] = {} + max_shapes: Dict[str, tuple] = {} + for idx, batch in enumerate(dataloader): + ids, x, y = batch + + if idx == 0: + init_counters_and_shapes(x, counters, min_shapes, max_shapes) + + for k, v in x.items(): + shape = v.shape + counters[k][shape] += 1 + min_shapes[k] = tuple(min(a, b) for a, b in zip(min_shapes[k], shape)) + max_shapes[k] = tuple(max(a, b) for a, b in zip(max_shapes[k], shape)) + + opt_shapes: Dict[str, tuple] = {} + for k, v in counters.items(): + opt_shapes[k] = v.most_common(1)[0][0] + + shapes = {} + for k in opt_shapes.keys(): # same keys in min_shapes and max_shapes + shapes[k] = ShapeSpec( + min=(1,) + min_shapes[k][1:], + max=(max_batch_size,) + max_shapes[k][1:], + opt=(max_batch_size,) + opt_shapes[k][1:], + ) + return shapes diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/core.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/core.py new file mode 100644 index 00000000..c65617fc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/core.py @@ -0,0 +1,242 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import importlib +import logging +import os +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union + +import numpy as np + +LOGGER = logging.getLogger(__name__) +DATALOADER_FN_NAME = "get_dataloader_fn" +GET_MODEL_FN_NAME = "get_model" +GET_SERVING_INPUT_RECEIVER_FN = "get_serving_input_receiver_fn" +GET_ARGPARSER_FN_NAME = "update_argparser" + + +class TensorSpec(NamedTuple): + name: str + dtype: str + shape: Tuple + + +class Parameter(Enum): + def __lt__(self, other: "Parameter") -> bool: + return self.value < other.value + + def __str__(self): + return self.value + + +class Accelerator(Parameter): + NONE = "none" + AMP = "amp" + TRT = "trt" + + CUDA = NONE # backward compatibility + + +class Precision(Parameter): + INT8 = "int8" + FP16 = "fp16" + FP32 = "fp32" + TF32 = "tf32" # Deprecated + + +class Format(Parameter): + TF_GRAPHDEF = "tf-graphdef" + TF_SAVEDMODEL = "tf-savedmodel" + TF_TRT = "tf-trt" + TF_ESTIMATOR = "tf-estimator" + TF_KERAS = "tf-keras" + ONNX = "onnx" + TRT = "trt" + TS_SCRIPT = "ts-script" + TS_TRACE = "ts-trace" + PYT = "pyt" + FASTERTRANSFORMER = "fastertransformer" + + +class Model(NamedTuple): + handle: object + # TODO: precision should be removed + precision: Optional[Precision] + inputs: Dict[str, TensorSpec] + outputs: Dict[str, TensorSpec] + + +def load_from_file(file_path, label, target): + spec = importlib.util.spec_from_file_location(name=label, location=file_path) + my_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(my_module) # pytype: disable=attribute-error + return getattr(my_module, target, None) + + +class BaseLoader(abc.ABC): + required_fn_name_for_signature_parsing: Optional[str] = None + + @abc.abstractmethod + def load(self, model_path: Union[str, Path], **kwargs) -> Model: + """ + Loads and process model from file based on given set of args + """ + pass + + +class BaseSaver(abc.ABC): + required_fn_name_for_signature_parsing: Optional[str] = None + + @abc.abstractmethod + def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> None: + """ + Save model to file + """ + pass + + +class BaseRunner(abc.ABC): + required_fn_name_for_signature_parsing: Optional[str] = None + + @abc.abstractmethod + def init_inference(self, model: Model): + raise NotImplementedError + + +class BaseRunnerSession(abc.ABC): + def __init__(self, model: Model): + self._model = model + + @abc.abstractmethod + def __enter__(self): + raise NotImplementedError() + + @abc.abstractmethod + def __exit__(self, exc_type, exc_value, 
traceback): + raise NotImplementedError() + + @abc.abstractmethod + def __call__(self, x: Dict[str, object]): + raise NotImplementedError() + + def _set_env_variables(self) -> Dict[str, object]: + """this method not remove values; fix it if needed""" + to_set = {} + old_values = {k: os.environ.pop(k, None) for k in to_set} + os.environ.update(to_set) + return old_values + + def _recover_env_variables(self, old_envs: Dict[str, object]): + for name, value in old_envs.items(): + if value is None: + del os.environ[name] + else: + os.environ[name] = str(value) + + +class BaseConverter(abc.ABC): + required_fn_name_for_signature_parsing: Optional[str] = None + + @abc.abstractmethod + def convert(self, model: Model, dataloader_fn) -> Model: + raise NotImplementedError() + + @staticmethod + def required_source_model_precision(requested_model_precision: Precision) -> Precision: + return requested_model_precision + + +class BaseMetricsCalculator(abc.ABC): + required_fn_name_for_signature_parsing: Optional[str] = None + + def calc( + self, + *, + ids: List[Any], + y_pred: Dict[str, np.ndarray], + x: Optional[Dict[str, np.ndarray]], + y_real: Optional[Dict[str, np.ndarray]], + ) -> Dict[str, float]: + """ + Calculates error/accuracy metrics + Args: + ids: List of ids identifying each sample in the batch + y_pred: model output as dict where key is output name and value is output value + x: model input as dict where key is input name and value is input value + y_real: input ground truth as dict where key is output name and value is output value + Returns: + dictionary where key is metric name and value is its value + """ + pass + + @abc.abstractmethod + def update( + self, + ids: List[Any], + y_pred: Dict[str, np.ndarray], + x: Optional[Dict[str, np.ndarray]], + y_real: Optional[Dict[str, np.ndarray]], + ): + pass + + @property + @abc.abstractmethod + def metrics(self) -> Dict[str, Any]: + pass + + +class ShapeSpec(NamedTuple): + min: Tuple + opt: Tuple + max: Tuple + + +class MeasurementMode(Enum): + COUNT_WINDOWS = "count_windows" + TIME_WINDOWS = "time_windows" + + +class PerformanceTool(Enum): + """ + Available performance evaluation tools + """ + + MODEL_ANALYZER = "model_analyzer" + PERF_ANALYZER = "perf_analyzer" + + +class BatchingMode(Enum): + """ + Available batching modes + """ + + STATIC = "static" + DYNAMIC = "dynamic" + + +class EvaluationMode(Enum): + """ + Available evaluation modes + """ + + OFFLINE = "offline" + ONLINE = "online" + + +class OfflineMode(Enum): + SYSTEM = "system" + CUDA = "cuda" diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/dump.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/dump.py new file mode 100644 index 00000000..9090f1f9 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/dump.py @@ -0,0 +1,253 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
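+
+# Dump writers/readers used to store model inputs and outputs on disk in
+# batches. BaseDumpWriter caches data per prefix (e.g. "inputs", "outputs") and
+# flushes it to numbered files once the cache exceeds FLUSH_THRESHOLD_B;
+# JsonDumpWriter emits the layout expected by perf_analyzer, and the matching
+# readers iterate the dumped batches back in order.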
+import abc +import json +import pickle +import threading +from pathlib import Path +from typing import Dict, Iterator, List, Union + +import numpy as np + +MB2B = 2 ** 20 +B2MB = 1 / MB2B +FLUSH_THRESHOLD_B = 256 * MB2B + + +def _validate_batch(name: str, value: Union[list, np.ndarray]): + if not isinstance(value, (list, np.ndarray)): + raise ValueError(f"Values shall be lists or np.ndarrays; current type {type(value)}") + + +def _validate_prefix_data(prefix_data: Dict[str, List[np.ndarray]]): + batch_sizes_per_io_name = {name: [len(batch) for batch in batches] for name, batches in prefix_data.items()} + names = list(batch_sizes_per_io_name) + for io_name in names: + for batch_idx, batch_size in enumerate(batch_sizes_per_io_name[io_name]): + if not all([batch_sizes_per_io_name[other_name][batch_idx] == batch_size for other_name in names]): + non_equal_batch_sizes = { + other_name: batch_sizes_per_io_name[other_name][batch_idx] for other_name in names + } + non_equal_batch_sizes_str = ", ".join( + [f"{name}={batch_size}" for name, batch_size in non_equal_batch_sizes.items()] + ) + raise ValueError( + "All inputs/outputs should have same number of batches with equal batch_size. " + f"At batch_idx={batch_idx} there are batch_sizes: {non_equal_batch_sizes_str}" + ) + # ensure if each io has same number of batches with equal size + + +def _get_nitems_and_batches(prefix_data: Dict[str, List[np.ndarray]]): + nitems = 0 + nbatches = 0 + + if prefix_data: + nitems_per_io_name = {name: sum(len(batch) for batch in batches) for name, batches in prefix_data.items()} + nbatches_per_io_name = {name: len(batches) for name, batches in prefix_data.items()} + nitems = list(nitems_per_io_name.values())[0] + nbatches = list(nbatches_per_io_name.values())[0] + return nitems, nbatches + + +class BaseDumpWriter(abc.ABC): + FILE_SUFFIX = ".abstract" + + def __init__(self, output_dir: Union[str, Path]): + self._output_dir = Path(output_dir) + # outer dict key is prefix (i.e. 
input/output/labels/...), inner dict key is input/output name + # list is list of batches + self._items_cache: Dict[str, Dict[str, List[np.ndarray]]] = {} + # key is prefix + self._items_counters: Dict[str, int] = {} + self._cache_lock = threading.RLock() + self._flush_threshold_b = FLUSH_THRESHOLD_B + + @property + def cache_size(self): + def _get_bytes_size(name, batch): + _validate_batch(name, batch) + if not isinstance(batch, np.ndarray): + batch = np.narray(batch) + + return batch.nbytes + + with self._cache_lock: + return { + prefix: sum(_get_bytes_size(name, batch) for name, batches in data.items() for batch in batches) + for prefix, data in self._items_cache.items() + } + + def _append_to_cache(self, prefix, prefix_data): + if prefix_data is None: + return + + if not isinstance(prefix_data, dict): + raise ValueError(f"{prefix} data to store shall be dict") + + with self._cache_lock: + cached_prefix_data = self._items_cache.setdefault(prefix, {}) + for name, batch in prefix_data.items(): + _validate_batch(name, batch) + if not isinstance(batch, np.ndarray): + batch = np.array(batch) + + cached_batches = cached_prefix_data.setdefault(name, []) + cached_batches += [batch] + + def write(self, **kwargs): + with self._cache_lock: + for prefix, prefix_data in kwargs.items(): + self._append_to_cache(prefix, prefix_data) + + biggest_prefix_data_size = max(self.cache_size.values()) + if biggest_prefix_data_size > self._flush_threshold_b: + self.flush() + + def flush(self): + with self._cache_lock: + for prefix, prefix_data in self._items_cache.items(): + _validate_prefix_data(prefix_data) + + output_path = self._output_dir / self._get_filename(prefix) + self._dump(prefix_data, output_path) + + nitems, nbatches = _get_nitems_and_batches(prefix_data) + self._items_counters[prefix] += nitems + self._items_cache = {} + + def _get_filename(self, prefix): + idx = self._items_counters.setdefault(prefix, 0) + return f"{prefix}-{idx:012d}{self.FILE_SUFFIX}" + + @abc.abstractmethod + def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path): + pass + + def __enter__(self): + if self._output_dir.exists() and len(list(self._output_dir.iterdir())): + raise ValueError(f"{self._output_dir.as_posix()} is not empty") + self._output_dir.mkdir(parents=True, exist_ok=True) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.flush() + + +class PickleDumpWriter(BaseDumpWriter): + FILE_SUFFIX = ".pkl" + + def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path): + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("wb") as pickle_file: + pickle.dump(prefix_data, pickle_file) + + +class JsonDumpWriter(BaseDumpWriter): + FILE_SUFFIX = ".json" + + def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path): + repacked_prefix_data = self._format_data(prefix_data) + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w") as json_file: + json.dump(repacked_prefix_data, json_file) + + def _format_data(self, prefix_data: Dict[str, List[np.ndarray]]) -> Dict: + def _format_batch_for_perf_analyzer_json_format(batch: np.ndarray): + return { + "content": batch.flatten().tolist(), + "shape": list(batch.shape), + "dtype": str(batch.dtype), + } + + _, nbatches = _get_nitems_and_batches(prefix_data) + batches = [{} for _ in range(nbatches)] + for io_name, batches_per_io in prefix_data.items(): + for batch_idx, batch in enumerate(batches_per_io): + batches[batch_idx][io_name] = 
_format_batch_for_perf_analyzer_json_format(batch) + + return {"data": batches} + + +class BaseDumpReader(abc.ABC): + FILE_SUFFIX = ".abstract" + + def __init__(self, dump_dir: Union[Path, str]): + self._dump_dir = Path(dump_dir) + + def get(self, prefix: str) -> Iterator[Dict[str, np.ndarray]]: + dump_files_paths = sorted(self._dump_dir.glob(f"{prefix}*{self.FILE_SUFFIX}")) + for dump_file_path in dump_files_paths: + prefix_data = self._load_file(dump_file_path) + nitems, nbatches = _get_nitems_and_batches(prefix_data) + for batch_idx in range(nbatches): + yield {io_name: prefix_data[io_name][batch_idx] for io_name in prefix_data} + + @abc.abstractmethod + def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]: + pass + + def iterate_over(self, prefix_list: List[str]) -> Iterator: + iterators = [self.get(prefix) for prefix in prefix_list] + empty_iterators = [False] * len(iterators) + while not all(empty_iterators): + values = [None] * len(iterators) + for idx, iterator in enumerate(iterators): + if empty_iterators[idx]: + continue + try: + values[idx] = next(iterator) + except StopIteration: + empty_iterators[idx] = True + if all(empty_iterators): + break + + if not all(empty_iterators): + yield values + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + +class PickleDumpReader(BaseDumpReader): + FILE_SUFFIX = ".pkl" + + def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]: + with dump_file_path.open("rb") as pickle_file: + return pickle.load(pickle_file) + + +class JsonDumpReader(BaseDumpReader): + FILE_SUFFIX = ".json" + + def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]: + with dump_file_path.open("rb") as json_file: + data = json.load(json_file) + return self._repack_data(data) + + def _repack_data(self, data: Dict) -> Dict[str, List[np.ndarray]]: + result: Dict[str, List[np.ndarray]] = {} + batches = data["data"] + for batch in batches: + for io_name, batch_as_dict in batch.items(): + io_batches = result.setdefault(io_name, []) + flat_array = batch_as_dict["content"] + shape = batch_as_dict["shape"] + dtype = batch_as_dict["dtype"] + batch_as_array = np.array(flat_array).reshape(shape).astype(dtype) + io_batches.append(batch_as_array) + return result diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/extensions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/extensions.py new file mode 100644 index 00000000..c328b64f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/extensions.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
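+
+# A small plugin registry: ExtensionManager instances for runners, loaders,
+# savers and converters are populated by scanning the toolkit directory for
+# modules that call `register_extension`. For example, the ONNX back-end in
+# bermuda/onnx.py registers itself with:
+#
+#     loaders.register_extension(Format.ONNX.value, OnnxLoader)
+#     runners.register_extension(Format.ONNX.value, OnnxRunner)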
+
+import importlib
+import logging
+import os
+import re
+from pathlib import Path
+from typing import List
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ExtensionManager:
+    def __init__(self, name: str):
+        self._name = name
+        self._registry = {}
+
+    def register_extension(self, extension: str, clazz):
+        already_registered_class = self._registry.get(extension, None)
+        if already_registered_class and already_registered_class.__module__ != clazz.__module__:
+            raise RuntimeError(
+                f"Conflicting extension {self._name}/{extension}; "
+                f"{already_registered_class.__module__}.{already_registered_class.__name__} "
+                f"and "
+                f"{clazz.__module__}.{clazz.__name__}"
+            )
+        elif already_registered_class is None:
+            clazz_full_name = f"{clazz.__module__}.{clazz.__name__}" if clazz is not None else "None"
+            LOGGER.debug(f"Registering extension {self._name}/{extension}: {clazz_full_name}")
+            self._registry[extension] = clazz
+
+    def get(self, extension):
+        if extension not in self._registry:
+            raise RuntimeError(f"Missing extension {self._name}/{extension}")
+        return self._registry[extension]
+
+    @property
+    def supported_extensions(self):
+        return list(self._registry)
+
+    @staticmethod
+    def scan_for_extensions(extension_dirs: List[Path]):
+        register_pattern = r".*\.register_extension\(.*"
+
+        for extension_dir in extension_dirs:
+            for python_path in extension_dir.rglob("*.py"):
+                if not python_path.is_file():
+                    continue
+                payload = python_path.read_text()
+                if re.findall(register_pattern, payload):
+                    import_path = python_path.relative_to(toolkit_root_dir.parent)
+                    package = import_path.parent.as_posix().replace(os.sep, ".")
+                    package_with_module = f"{package}.{import_path.stem}"
+                    spec = importlib.util.spec_from_file_location(name=package_with_module, location=python_path)
+                    my_module = importlib.util.module_from_spec(spec)
+                    my_module.__package__ = package
+
+                    try:
+                        spec.loader.exec_module(my_module)  # pytype: disable=attribute-error
+                    except ModuleNotFoundError as e:
+                        LOGGER.error(
+                            f"Could not load extensions from {import_path} due to missing python packages; {e}"
+                        )
+
+
+runners = ExtensionManager("runners")
+loaders = ExtensionManager("loaders")
+savers = ExtensionManager("savers")
+converters = ExtensionManager("converters")
+toolkit_root_dir = (Path(__file__).parent / "..").resolve()
+ExtensionManager.scan_for_extensions([toolkit_root_dir])
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/__init__.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/__init__.py
new file mode 100644
index 00000000..0dfac345
--- /dev/null
+++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
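+
+# Re-export the public entry points of the model_analyzer helper package.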
+from .model_analyzer import ModelAnalyzer, ModelAnalyzerMode # noqa: F401 +from .model_analyzer_config import ModelAnalyzerConfig # noqa: F401 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/exceptions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/exceptions.py new file mode 100644 index 00000000..8947a98e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/exceptions.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +class ModelAnalyzerException(Exception): + def __init__(self, message: str): + self._message = message + + def __str__(self): + """ + Get the exception string representation. + + Returns + ------- + str + The message associated with this exception, or None if no message. + """ + return self._message + + @property + def message(self): + """ + Get the exception message. + + Returns + ------- + str + The message associated with this exception, or None if no message. + """ + return self._message diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer.py new file mode 100644 index 00000000..9ca2d954 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import subprocess +from subprocess import CalledProcessError + +from .exceptions import ModelAnalyzerException + +SERVER_OUTPUT_TIMEOUT_SECS = 5 +LOGGER = logging.getLogger(__name__) + + +class ModelAnalyzerMode: + PROFILE = "profile" + ANALYZE = "analyze" + REPORT = "report" + + +class ModelAnalyzer: + """ + Concrete Implementation of Model Analyzer interface that runs + analyzer locally as as subprocess. 
+ """ + + _analyzer_path = "model-analyzer" + + def __init__(self, config): + """ + Parameters + ---------- + path : str + The absolute path to the model analyzer executable + config : AnalyzerConfig + the config object containing arguments for this server instance + """ + + self._analyzer_process = None + self._analyzer_config = config + self._log = None + + def run(self, mode: str, verbose: bool = False, quiet: bool = False): + """ + Starts the model analyzer locally + """ + + if self._analyzer_path: + + cmd = [self._analyzer_path] + if verbose: + cmd += ["--verbose"] + + if quiet: + cmd += ["--quiet"] + + cmd += [mode] + cmd += self._analyzer_config.to_cli_string().split() + + LOGGER.debug(f"Model Analyze command: {cmd}") + try: + subprocess.run(cmd, check=True, start_new_session=True) + + except CalledProcessError as e: + raise ModelAnalyzerException( + f"Running {self._analyzer_path} with {e.cmd} failed with" + f" exit status {e.returncode} : {e.output}" + ) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer_config.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer_config.py new file mode 100644 index 00000000..21dbb965 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/model_analyzer/model_analyzer_config.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .exceptions import ModelAnalyzerException + + +class ModelAnalyzerConfig: + """ + A config class to set arguments to the Model Analyzer. + An argument set to None will use the default. + """ + + model_analyzer_args = [ + "config-file", + ] + + input_to_options = [ + "config-file", + ] + + def __init__(self): + # Args will be a dict with the string representation as key + self._args = {k: None for k in self.model_analyzer_args} + + self._options = { + "-f": "config.yaml", + } + + self._input_to_options = { + "config-file": "-f", + } + + def to_cli_string(self): + """ + Utility function to convert a config into a + string of arguments to the server with CLI. + Returns + ------- + str + the command consisting of all set arguments to + the model analyzer. + e.g. 
'--model-repository=/models --verbose=True' + """ + # single dashed options, then verbose flags, then main args + args = [f"{k} {v}" for k, v in self._options.items() if v] + args += [f"--{k}={v}" for k, v in self._args.items() if v] + + return " ".join(args) + + @classmethod + def allowed_keys(cls): + """ + Returns + ------- + list of str + The keys that are allowed to be + passed into model_analyzer + """ + + return list(cls.model_analyzer_args) + list(cls.input_to_options) + + def __getitem__(self, key): + """ + Gets an arguments value in config + Parameters + ---------- + key : str + The name of the argument to the model analyzer + Returns + ------- + The value that the argument is set to in this config + """ + + if key in self._args: + return self._args[key] + elif key in self._input_to_options: + return self._options[self._input_to_options[key]] + else: + raise ModelAnalyzerException(f"'{key}' Key not found in config") + + def __setitem__(self, key, value): + """ + Sets an arguments value in config + after checking if defined/supported. + Parameters + ---------- + key : str + The name of the argument to the model analyzer + value : (any) + The value to which the argument is being set + Raises + ------ + TritonModelAnalyzerException + If key is unsupported or undefined in the + config class + """ + if key in self._args: + self._args[key] = value + elif key in self._input_to_options: + self._options[self._input_to_options[key]] = value + else: + raise ModelAnalyzerException(f"The argument '{key}' to the Model Analyzer is not supported.") diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/__init__.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/__init__.py new file mode 100644 index 00000000..e1dfc06e --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +# method from PEP-366 to support relative import in executed modules +if __package__ is None: + __package__ = pathlib.Path(__file__).parent.name + +from .perf_analyzer import PerfAnalyzer # noqa: F401 +from .perf_config import PerfAnalyzerConfig # noqa: F401 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/exceptions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/exceptions.py new file mode 100644 index 00000000..56a595e0 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/exceptions.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class PerfAnalyzerException(Exception): + def __init__(self, message: str): + self._message = message + + def __str__(self): + """ + Get the exception string representation. + + Returns + ------- + str + The message associated with this exception, or None if no message. + """ + return self._message + + @property + def message(self): + """ + Get the exception message. + + Returns + ------- + str + The message associated with this exception, or None if no message. + """ + return self._message diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_analyzer.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_analyzer.py new file mode 100644 index 00000000..68282a43 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_analyzer.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import pathlib +from subprocess import PIPE, CalledProcessError, Popen + +# method from PEP-366 to support relative import in executed modules +if __package__ is None: + __package__ = pathlib.Path(__file__).parent.name + +from .exceptions import PerfAnalyzerException + +MAX_INTERVAL_CHANGES = 10 +COUNT_INTERVAL_DELTA = 50 +TIME_INTERVAL_DELTA = 2000 + +LOGGER = logging.getLogger(__name__) + + +class PerfAnalyzer: + """ + This class provides an interface for running workloads + with perf_analyzer. + """ + + def __init__(self, config): + """ + Parameters + ---------- + config : PerfAnalyzerConfig + keys are names of arguments to perf_analyzer, + values are their values. 
+ """ + self.bin_path = "perf_analyzer" + self._config = config + self._output = str() + + def run(self): + """ + Runs the perf analyzer with the + initialized configuration + + Returns + ------- + List of Records + List of the metrics obtained from this + run of perf_analyzer + + Raises + ------ + ServicAnalyzerException + If subprocess throws CalledProcessError + """ + for _ in range(MAX_INTERVAL_CHANGES): + command = [self.bin_path] + command += self._config.to_cli_string().replace("=", " ").split() + + LOGGER.debug(f"Perf Analyze command: {command}") + try: + process = Popen(command, start_new_session=True, stdout=PIPE, encoding="utf-8") + while True: + output = process.stdout.readline() + if output == "" and process.poll() is not None: + break + if output: + self._output += output + print(output.rstrip()) + + result = process.poll() + if result != 0: + raise CalledProcessError(returncode=result, cmd=command, output=self._output) + + return + + except CalledProcessError as e: + if self._faild_with_measruement_inverval(e.output): + if self._config["measurement-mode"] is None or self._config["measurement-mode"] == "count_windows": + self._increase_request_count() + else: + self._increase_time_interval() + else: + raise PerfAnalyzerException( + f"Running perf_analyzer with {e.cmd} failed with" f" exit status {e.returncode} : {e.output}" + ) + + raise PerfAnalyzerException(f"Ran perf_analyzer {MAX_INTERVAL_CHANGES} times, but no valid requests recorded.") + + def output(self): + """ + Returns + ------- + The stdout output of the + last perf_analyzer run + """ + if self._output: + return self._output + raise PerfAnalyzerException("Attempted to get perf_analyzer output" "without calling run first.") + + def _faild_with_measruement_inverval(self, output: str): + return ( + output.find("Failed to obtain stable measurement") or output.find("Please use a larger time window") != -1 + ) + + def _increase_request_count(self): + self._config["measurement-request-count"] += COUNT_INTERVAL_DELTA + LOGGER.debug( + "perf_analyzer's measurement request count is too small, " + f"decreased to {self._config['measurement-request-count']}." + ) + + def _increase_time_interval(self): + self._config["measurement-interval"] += TIME_INTERVAL_DELTA + LOGGER.debug( + "perf_analyzer's measurement window is too small, " + f"increased to {self._config['measurement-interval']} ms." + ) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_config.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_config.py new file mode 100644 index 00000000..39d363a5 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/perf_analyzer/perf_config.py @@ -0,0 +1,216 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Dict + +from .exceptions import PerfAnalyzerException + + +class PerfAnalyzerConfig: + """ + A config class to set arguments to the perf_analyzer. + An argument set to None will use the perf_analyzer's default. + """ + + perf_analyzer_args = [ + "async", + "sync", + "measurement-interval", + "measurement-mode", + "measurement-request-count", + "concurrency-range", + "request-rate-range", + "request-distribution", + "request-intervals", + "binary-search", + "num-of-sequence", + "latency-threshold", + "max-threads", + "stability-percentage", + "max-trials", + "percentile", + "input-data", + "shared-memory", + "output-shared-memory-size", + "sequence-length", + "string-length", + "string-data", + ] + + perf_analyzer_multiple_args = [ + "shape", + ] + + input_to_options = [ + "model-name", + "model-version", + "batch-size", + "url", + "protocol", + "latency-report-file", + "streaming", + ] + + input_to_verbose = ["verbose", "extra-verbose"] + + def __init__(self): + """ + Construct a PerfAnalyzerConfig + """ + + self._args = {k: None for k in self.perf_analyzer_args} + self._multiple_args = {k: [] for k in self.perf_analyzer_multiple_args} + + self._options = { + "-m": None, + "-x": None, + "-b": None, + "-u": None, + "-i": None, + "-f": None, + "-H": None, + "-c": None, + "-t": None, + } + self._verbose = {"-v": None, "-v -v": None} + + self._input_to_options = { + "model-name": "-m", + "model-version": "-x", + "batch-size": "-b", + "url": "-u", + "protocol": "-i", + "latency-report-file": "-f", + "streaming": "-H", + "concurrency": "-c", + "threads": "-t", + } + + self._input_to_verbose = {"verbose": "-v", "extra-verbose": "-v -v"} + + @classmethod + def allowed_keys(cls): + """ + Returns + ------- + list of str + The keys that are allowed to be + passed into perf_analyzer + """ + + return ( + list(cls.perf_analyzer_args) + + list(cls.perf_analyzer_multiple_args) + + list(cls.input_to_options) + + list(cls.input_to_verbose) + ) + + def update_config(self, params=None): + """ + Allows setting values from a + params dict + + Parameters + ---------- + params: dict + keys are allowed args to perf_analyzer + """ + + if params: + for key in params: + self[key] = params[key] + + def to_cli_string(self): + """ + Utility function to convert a config into a + string of arguments to the perf_analyzer with CLI. + + Returns + ------- + str + cli command string consisting of all arguments + to the perf_analyzer set in the config, without + the executable name. 
+ """ + + # single dashed options, then verbose flags, then main args + args = [f"{k} {v}" for k, v in self._options.items() if v] + args += [k for k, v in self._verbose.items() if v] + args += [f"--{k}={v}" for k, v in self._args.items() if v] + for k, v in self._multiple_args.items(): + for item in v: + args.append(f"--{k}={item}") + + return " ".join(args) + + def __getitem__(self, key: str): + """ + Gets an arguments value in config + + Parameters + ---------- + key : str + The name of the argument to the perf_analyzer + + Returns + ------- + The value that the argument is set to in this config + + Raises + ------ + TritonModelAnalyzerException + If argument not found in the config + """ + + if key in self._args: + return self._args[key] + elif key in self._multiple_args: + return self._multiple_args[key] + elif key in self._input_to_options: + return self._options[self._input_to_options[key]] + elif key in self._input_to_verbose: + return self._verbose[self._input_to_verbose[key]] + else: + raise PerfAnalyzerException(f"'{key}' Key not found in config") + + def __setitem__(self, key: str, value: Any): + """ + Sets an arguments value in config + after checking if defined/supported. + + Parameters + ---------- + key : str + The name of the argument to the perf_analyzer + value : (any) + The value to which the argument is being set + + Raises + ------ + TritonModelAnalyzerException + If key is unsupported or undefined in the + config class + """ + + if key in self._args: + self._args[key] = value + elif key in self._multiple_args: + self._multiple_args[key].append(value) + elif key in self._input_to_options: + self._options[self._input_to_options[key]] = value + elif key in self._input_to_verbose: + self._verbose[self._input_to_verbose[key]] = value + else: + raise PerfAnalyzerException( + f"The argument '{key}' to the perf_analyzer " "is not supported by the model analyzer." + ) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/report.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/report.py new file mode 100644 index 00000000..0e53e437 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/report.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import re +from typing import Dict, List + +from natsort import natsorted +from tabulate import tabulate + + +def sort_results(results: List): + results = natsorted(results, key=lambda item: [item[key] for key in item.keys()]) + return results + + +def save_results(filename: str, data: List, formatted: bool = False): + data = format_data(data=data) if formatted else data + with open(filename, "a") as csvfile: + fieldnames = data[0].keys() + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + for row in data: + writer.writerow(row) + + +def format_data(data: List[Dict]) -> List[Dict]: + formatted_data = list() + for item in data: + formatted_item = format_keys(data=item) + formatted_data.append(formatted_item) + + return formatted_data + + +def format_keys(data: Dict) -> Dict: + keys = {format_key(key=key): value for key, value in data.items()} + return keys + + +def format_key(key: str) -> str: + key = " ".join([k.capitalize() for k in re.split("_| ", key)]) + return key + + +def show_results(results: List[Dict]): + headers = list(results[0].keys()) + summary = map(lambda x: list(map(lambda item: item[1], x.items())), results) + print(tabulate(summary, headers=headers)) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/utils.py new file mode 100644 index 00000000..c1a1a6f3 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from typing import Tuple + +LOGGER = logging.getLogger(__name__) + + +def parse_server_url(server_url: str) -> Tuple[str, str, int]: + DEFAULT_PORTS = {"http": 8000, "grpc": 8001} + + # extract protocol + server_url_items = server_url.split("://") + if len(server_url_items) != 2: + raise ValueError("Prefix server_url with protocol ex.: grpc://127.0.0.1:8001") + requested_protocol, server_url = server_url_items + requested_protocol = requested_protocol.lower() + + if requested_protocol not in DEFAULT_PORTS: + raise ValueError(f"Unsupported protocol: {requested_protocol}") + + # extract host and port + default_port = DEFAULT_PORTS[requested_protocol] + server_url_items = server_url.split(":") + if len(server_url_items) == 1: + host, port = server_url, default_port + elif len(server_url_items) == 2: + host, port = server_url_items + port = int(port) + if port != default_port: + LOGGER.warning( + f"Current server URL is {server_url} while default {requested_protocol} port is {default_port}" + ) + else: + raise ValueError(f"Could not parse {server_url}. 
Example of correct server URL: grpc://127.0.0.1:8001") + return requested_protocol, host, port diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/warmup.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/warmup.py new file mode 100644 index 00000000..8dbdd37f --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/deployment_toolkit/warmup.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import pathlib +from distutils.version import LooseVersion +from importlib.metadata import version +from typing import List + +TRITON_CLIENT_VERSION = LooseVersion(version("tritonclient")) + +# method from PEP-366 to support relative import in executed modules +if __package__ is None: + __package__ = pathlib.Path(__file__).parent.name + +from .core import BatchingMode, EvaluationMode, MeasurementMode, OfflineMode +from .perf_analyzer import PerfAnalyzer, PerfAnalyzerConfig +from .utils import parse_server_url + +LOGGER = logging.getLogger("warmup") + + +def performance_evaluation_warmup( + server_url: str, + model_name: str, + batch_sizes: List[int], + number_of_triton_instances: int, + number_of_model_instances: int, + input_data: str, + input_shapes: List[str], + measurement_mode: MeasurementMode, + measurement_interval: int, + measurement_request_count: int, + batching_mode: BatchingMode, + offline_mode: OfflineMode, + evaluation_mode: EvaluationMode, +): + protocol, host, port = parse_server_url(server_url) + + measurement_interval = 2 * measurement_interval + measurement_request_count = 2 * measurement_request_count + + if batching_mode == BatchingMode.STATIC: + batch_sizes = sorted({1, batch_sizes[-1]}) + max_concurrency = 1 + min_concurrency = 1 + step = 1 + elif batching_mode == BatchingMode.DYNAMIC: + max_batch_size = max(batch_sizes) + max_total_requests = 2 * max_batch_size * number_of_triton_instances * number_of_model_instances + max_concurrency = min(256, max_total_requests) + step = max(1, max_concurrency // 2) + min_concurrency = step + batch_sizes = [max(1, max_total_requests // 256)] + else: + raise ValueError(f"Unsupported batching mode: {batching_mode}") + + for batch_size in batch_sizes: + params = { + "model-name": model_name, + "model-version": 1, + "batch-size": batch_size, + "url": f"{host}:{port}", + "protocol": protocol, + "input-data": input_data, + "measurement-interval": measurement_interval, + "concurrency-range": f"{min_concurrency}:{max_concurrency}:{step}", + "verbose": True, + } + + if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"): + params["measurement-mode"] = measurement_mode.value + params["measurement-request-count"] = measurement_request_count + + if evaluation_mode == EvaluationMode.OFFLINE: + params["shared-memory"] = offline_mode.value + + config = PerfAnalyzerConfig() + for param, value in params.items(): + config[param] = value + + for shape in input_shapes: + config["shape"] = shape + + 
perf_analyzer = PerfAnalyzer(config=config) + perf_analyzer.run() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/export_model.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/export_model.py new file mode 100755 index 00000000..175da1ab --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/export_model.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import os +from pathlib import Path + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1" + +# method from PEP-366 to support relative import in executed modules +if __name__ == "__main__" and __package__ is None: + __package__ = Path(__file__).parent.name + +from .deployment_toolkit.args import ArgParserGenerator # noqa: E402 module level import not at top of file +from .deployment_toolkit.core import ( # noqa: E402 module level import not at top of file + DATALOADER_FN_NAME, + BaseLoader, + BaseSaver, + Format, + load_from_file, +) +from .deployment_toolkit.extensions import loaders, savers # noqa: E402 module level import not at top of file + +LOGGER = logging.getLogger("export_model") + +INPUT_MODEL_TYPES = [Format.TF_ESTIMATOR, Format.TF_KERAS, Format.PYT] +OUTPUT_MODEL_TYPES = [Format.TF_SAVEDMODEL, Format.TS_TRACE, Format.TS_SCRIPT, Format.ONNX] + + +def _get_args(): + parser = argparse.ArgumentParser( + description="Script for exporting models from supported frameworks.", allow_abbrev=False + ) + parser.add_argument("--input-path", help="Path to input python module", required=True) + parser.add_argument( + "--input-type", help="Input model type", choices=[f.value for f in INPUT_MODEL_TYPES], required=True + ) + parser.add_argument("--output-path", help="Path to output model file", required=True) + parser.add_argument( + "--output-type", help="Output model type", choices=[f.value for f in OUTPUT_MODEL_TYPES], required=True + ) + parser.add_argument("--dataloader", help="Path to python module containing data loader") + parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False) + parser.add_argument( + "--ignore-unknown-parameters", + help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)", + action="store_true", + default=False, + ) + + args, unparsed_args = parser.parse_known_args() + + Loader: BaseLoader = loaders.get(args.input_type) + ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser) + + if args.input_type == Format.PYT.value and args.output_type == Format.ONNX.value: + saver_type = f"{Format.PYT.value}--{Format.ONNX.value}" + else: + saver_type = args.output_type + Saver: BaseSaver = savers.get(saver_type) + ArgParserGenerator(Saver).update_argparser(parser) + + if args.dataloader is not None: + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + 
ArgParserGenerator(get_dataloader_fn).update_argparser(parser) + + if args.ignore_unknown_parameters: + args, unknown_args = parser.parse_known_args() + LOGGER.warning(f"Got additional args {unknown_args}") + else: + args = parser.parse_args() + return args + + +def main(): + args = _get_args() + + log_level = logging.INFO if not args.verbose else logging.DEBUG + log_format = "%(asctime)s %(levelname)s %(name)s %(message)s" + logging.basicConfig(level=log_level, format=log_format) + + LOGGER.info("args:") + for key, value in vars(args).items(): + LOGGER.info(f" {key} = {value}") + + dataloader_fn = None + if args.dataloader is not None: + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args) + + Loader: BaseLoader = loaders.get(args.input_type) + loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args) + model = loader.load(args.input_path, dataloader_fn=dataloader_fn, output_type=args.output_type) + + LOGGER.info("inputs: %s", model.inputs) + LOGGER.info("outputs: %s", model.outputs) + + if args.input_type == Format.PYT.value and args.output_type == Format.ONNX.value: + saver_type = f"{Format.PYT.value}--{Format.ONNX.value}" + else: + saver_type = args.output_type + Saver: BaseSaver = savers.get(saver_type) + saver = ArgParserGenerator(Saver).from_args(args) + saver.save(model, args.output_path, dataloader_fn) + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/metrics.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/metrics.py new file mode 100644 index 00000000..774e5c7b --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/metrics.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
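The export flow in `main()` above resolves loader and saver classes through the extension registries; for the PyTorch-to-ONNX path it composes a dedicated saver key. A small sketch of that key selection follows, assuming `Format.PYT.value == "pyt"` and `Format.ONNX.value == "onnx"` (the enum itself lives in `deployment_toolkit.core`, outside this excerpt).

```python
# Hedged sketch of the saver-key selection used by export_model.py above.
# Assumption: Format.PYT.value == "pyt" and Format.ONNX.value == "onnx".
def select_saver_type(input_type: str, output_type: str) -> str:
    # PyTorch models exported to ONNX use a dedicated converter/saver pair;
    # every other combination is keyed by the output format alone.
    if (input_type, output_type) == ("pyt", "onnx"):
        return f"{input_type}--{output_type}"
    return output_type


print(select_saver_type("pyt", "onnx"))        # pyt--onnx
print(select_saver_type("tf-keras", "onnx"))   # onnx
```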
+ +import os +import pandas as pd +import numpy as np +import pickle +import argparse +import hydra +import torch +from triton.deployment_toolkit.core import BaseMetricsCalculator +from omegaconf import OmegaConf + +def update_argparser(parser): + parser.add_argument("--model-dir", type=str, help="Path to the model directory you would like to use (likely in outputs)", required=True) + + + + + +class MetricsCalculator(BaseMetricsCalculator): + def __init__(self, model_dir): + with open(os.path.join(model_dir, ".hydra/config_merged.yaml"), "rb") as f: + self.config = OmegaConf.load(f) + self.config._target_ = self.config.config.evaluator._target_ + self.evaluator = hydra.utils.call(self.config) + self.config= self.config.config + self.output_selector = self.config.model.get("preds_test_output_selector", -1) + self.predictions = [] + self.targets = [] + self.ids = [] + if self.config.evaluator.get("use_weights", False): + self.weights = [] + + + @property + def metrics(self): + targets = np.concatenate(self.targets, axis=0) + # targets = torch.cat(self.targets, dim=0) + predictions = np.concatenate(self.predictions, axis=0) + # predictions = torch.cat(self.predictions, dim=0) + + ids = np.concatenate(self.ids, axis=0) + if self.config.evaluator.get("use_weights", False): + weights = torch.cat(self.weights).cpu().numpy() + else: + weights = np.zeros((0, 0)) + return self.evaluator(targets, predictions, weights, ids=ids) + + def update( + self, + ids, + y_pred, + x, + y_real, + ): + #can probably just pass all of this to the evaluator main class + self.targets.append(y_real['target__0'][:,:,0][:,:,np.newaxis]) + self.ids.append(ids) + if self.config.evaluator.get("use_weights", False): + self.weights.append(x["weight"]) + preds = y_pred["target__0"] + self.predictions.append(preds) + + # return self.metrics diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/model.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/model.py new file mode 100644 index 00000000..055f8170 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/model.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
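A hedged driver for the `MetricsCalculator` above: the model directory is a hypothetical TSPP training output (it must contain `.hydra/config_merged.yaml` so the evaluator can be instantiated via hydra), and the array shapes are illustrative only.

```python
# Hedged sketch of feeding predictions to MetricsCalculator; the model_dir path
# is hypothetical and must point at a trained TSPP output directory.
import numpy as np

calculator = MetricsCalculator(model_dir="/workspace/outputs/2021-01-01/12-00-00")

batch, horizon = 16, 24
ids = np.arange(batch)
y_pred = {"target__0": np.random.rand(batch, horizon, 1).astype(np.float32)}
y_real = {"target__0": np.random.rand(batch, horizon, 1).astype(np.float32)}
x = {}  # would need a "weight" entry if the evaluator is configured with use_weights

calculator.update(ids, y_pred=y_pred, x=x, y_real=y_real)
print(calculator.metrics)  # delegates to the hydra-instantiated evaluator
```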
+ +import os +import torch +import torch.nn as nn +import hydra + +from typing import Dict, Tuple, Optional, List +from omegaconf import OmegaConf + + +def update_argparser(parser): + parser.add_argument("--model-dir", type=str, help="Path to the model directory you would like to use (likely in outputs)", required=True) + +class ModelWrapper(nn.Module): + def __init__(self, model, test_func, output_selector): + super().__init__() + self.model = model + self.test_func = test_func + self.output_selector = output_selector + + def forward(self, s_cat, s_cont, k_cat, k_cont, o_cat, o_cont, target, weight, sample_weight, id): + wrapped_input = {} + wrapped_input['s_cat'] = s_cat if len(s_cat.shape) != 10 else None + wrapped_input['s_cont'] = s_cont if len(s_cont.shape) != 10 else None + wrapped_input['k_cat'] = k_cat if len(k_cat.shape) != 10 else None + wrapped_input['k_cont'] = k_cont if len(k_cont.shape) != 10 else None + wrapped_input['o_cat'] = o_cat if len(o_cat.shape) != 10 else None + wrapped_input['o_cont'] = o_cont if len(o_cont.shape) != 10 else None + wrapped_input['weight'] = weight if len(weight.shape) != 10 else None + wrapped_input['sample_weight'] = sample_weight if len(sample_weight.shape) != 10 else None + wrapped_input['target'] = target + wrapped_input['id'] = id if id.numel() else None + output = self.test_func(wrapped_input) + if self.output_selector >= 0: + return output[..., self.output_selector : self.output_selector + 1] + return output + +def get_model(**args): + #get model config + with open(os.path.join(args['model_dir'], ".hydra/config_merged.yaml"), "rb") as f: + config = OmegaConf.load(f) + os.environ["TFT_SCRIPTING"] = "True" + state_dict = torch.load(os.path.join(args['model_dir'], "best_checkpoint.pth.tar"))['model_state_dict'] + if config.config.device.get("world_size", 1) > 1: + model_params = list(state_dict.items()) + for k, v in model_params: + if k[:7] == "module.": + state_dict[k[7:]] = v + del state_dict[k] + config._target_ = config.config.model._target_ + model = hydra.utils.instantiate(config) + test_method_name = config.config.model.get("test_method", "__call__") + test_method = getattr(model, test_method_name) + #load model + preds_test_output_selector = config.config.model.get( + "preds_test_output_selector", -1 + ) + model.load_state_dict(state_dict) + model.eval() + model.cuda() + model = ModelWrapper(model, test_method, preds_test_output_selector).cuda() + tensor_names = { + "inputs": ['s_cat__0', 's_cont__1', 'k_cat__2', 'k_cont__3', 'o_cat__4', 'o_cont__5', 'target__6', 'weight__7', 'sample_weight__8', 'id__9'], + "outputs": ["target__0"] + } + return model, tensor_names diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/requirements.txt new file mode 100644 index 00000000..4b8db835 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/requirements.txt @@ -0,0 +1,23 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +model_navigator[pyt] @ git+https://github.com/triton-inference-server/model_navigator.git@v0.2.2#egg=model_navigator +natsort>=7.0.0 +networkx==2.5 +numpy +onnx==1.10.1 +onnxruntime-gpu==1.8.1 +pycuda>=2019.1.2 +PyYAML>=5.2 +tabulate>=0.8.7 +tqdm>=4.44.1 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_fw.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_fw.py new file mode 100755 index 00000000..ad33b1fc --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_fw.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +To infer the model on framework runtime, you can use `run_inference_on_fw.py` script. +It infers data obtained from pointed data loader locally and saves received data into dump files. +Those files are stored in directory pointed by `--output-dir` argument. + +Example call: + +```shell script +python ./triton/run_inference_on_fw.py \ + --input-path /models/exported/model.onnx \ + --input-type onnx \ + --dataloader triton/dataloader.py \ + --data-dir /data/imagenet \ + --batch-size 32 \ + --output-dir /results/dump_local \ + --dump-labels +``` +""" + +import argparse +import logging +import os +from pathlib import Path + +from tqdm import tqdm + +# method from PEP-366 to support relative import in executed modules +if __package__ is None: + __package__ = Path(__file__).parent.name + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "0" + + +from .deployment_toolkit.args import ArgParserGenerator # noqa: E402 module level import not at top of file +from .deployment_toolkit.core import ( # noqa: E402 module level import not at top of file + DATALOADER_FN_NAME, + BaseLoader, + BaseRunner, + load_from_file, +) +from .deployment_toolkit.dump import JsonDumpWriter # noqa: E402 module level import not at top of file +from .deployment_toolkit.extensions import loaders, runners # noqa: E402 module level import not at top of file + +LOGGER = logging.getLogger("run_inference_on_fw") + + +def _verify_and_format_dump(args, ids, x, y_pred, y_real): + data = {"outputs": y_pred, "ids": {"ids": ids}} + if args.dump_inputs: + data["inputs"] = x + if args.dump_labels: + if not y_real: + raise ValueError( + "Found empty label values. 
Please provide labels in dataloader_fn or do not use --dump-labels argument" + ) + data["labels"] = y_real + return data + + +def _parse_and_validate_args(): + supported_inputs = set(runners.supported_extensions) & set(loaders.supported_extensions) + + parser = argparse.ArgumentParser(description="Dump local inference output of given model", allow_abbrev=False) + parser.add_argument("--input-path", help="Path to input model", required=True) + parser.add_argument("--input-type", help="Input model type", choices=supported_inputs, required=True) + parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True) + parser.add_argument("--output-dir", help="Path to dir where output files will be stored", required=True) + parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False) + parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False) + parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False) + + args, *_ = parser.parse_known_args() + + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + ArgParserGenerator(get_dataloader_fn).update_argparser(parser) + + Loader: BaseLoader = loaders.get(args.input_type) + ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser) + + Runner: BaseRunner = runners.get(args.input_type) + ArgParserGenerator(Runner).update_argparser(parser) + + args = parser.parse_args() + + types_requiring_io_params = [] + + if args.input_type in types_requiring_io_params and not all(p for p in [args.inputs, args.outptputs]): + parser.error(f"For {args.input_type} input provide --inputs and --outputs parameters") + + return args + + +def main(): + args = _parse_and_validate_args() + + log_level = logging.INFO if not args.verbose else logging.DEBUG + log_format = "%(asctime)s %(levelname)s %(name)s %(message)s" + logging.basicConfig(level=log_level, format=log_format) + + LOGGER.info("args:") + for key, value in vars(args).items(): + LOGGER.info(f" {key} = {value}") + + Loader: BaseLoader = loaders.get(args.input_type) + Runner: BaseRunner = runners.get(args.input_type) + + loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args) + runner = ArgParserGenerator(Runner).from_args(args) + LOGGER.info(f"Loading {args.input_path}") + model = loader.load(args.input_path) + with runner.init_inference(model=model) as runner_session, JsonDumpWriter(args.output_dir) as writer: + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args) + LOGGER.info("Data loader initialized; Running inference") + for ids, x, y_real in tqdm(dataloader_fn(), unit="batch", mininterval=10): + y_pred = runner_session(x) + data = _verify_and_format_dump(args, ids=ids, x=x, y_pred=y_pred, y_real=y_real) + writer.write(**data) + LOGGER.info("Inference finished") + + +if __name__ == "__main__": + main() diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_triton.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_triton.py new file mode 100755 index 00000000..86977449 --- /dev/null +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_inference_on_triton.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +To infer the model deployed on Triton, you can use `run_inference_on_triton.py` script. +It sends a request with data obtained from pointed data loader and dumps received data into dump files. +Those files are stored in directory pointed by `--output-dir` argument. + +Currently, the client communicates with the Triton server asynchronously using GRPC protocol. + +Example call: + +```shell script +python ./triton/run_inference_on_triton.py \ + --server-url localhost:8001 \ + --model-name ResNet50 \ + --model-version 1 \ + --dump-labels \ + --output-dir /results/dump_triton +``` +""" + +import argparse +import functools +import logging +import queue +import threading +import time +import traceback +from pathlib import Path +from typing import Optional + +from tqdm import tqdm + +# pytype: disable=import-error +try: + from tritonclient import utils as client_utils # noqa: F401 + from tritonclient.grpc import InferenceServerClient, InferInput, InferRequestedOutput +except ImportError: + from tritongrpcclient import InferenceServerClient, InferInput, InferRequestedOutput +# pytype: enable=import-error + +# method from PEP-366 to support relative import in executed modules +if __package__ is None: + __package__ = Path(__file__).parent.name + +from .deployment_toolkit.args import ArgParserGenerator +from .deployment_toolkit.core import DATALOADER_FN_NAME, load_from_file +from .deployment_toolkit.dump import JsonDumpWriter + +LOGGER = logging.getLogger("run_inference_on_triton") + + +class SyncGRPCTritonRunner: + DEFAULT_MAX_RESP_WAIT_S = 120 + + def __init__( + self, + server_url: str, + model_name: str, + model_version: str, + *, + dataloader, + verbose=False, + resp_wait_s: Optional[float] = None, + ): + self._server_url = server_url + self._model_name = model_name + self._model_version = model_version + self._dataloader = dataloader + self._verbose = verbose + self._response_wait_t = self.DEFAULT_MAX_RESP_WAIT_S if resp_wait_s is None else resp_wait_s + + def __iter__(self): + client = InferenceServerClient(self._server_url, verbose=self._verbose) + error = self._verify_triton_state(client) + if error: + raise RuntimeError(f"Could not communicate to Triton Server: {error}") + + LOGGER.debug( + f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!" 
+ ) + + model_config = client.get_model_config(self._model_name, self._model_version) + model_metadata = client.get_model_metadata(self._model_name, self._model_version) + LOGGER.info(f"Model config {model_config}") + LOGGER.info(f"Model metadata {model_metadata}") + + inputs = {tm.name: tm for tm in model_metadata.inputs} + outputs = {tm.name: tm for tm in model_metadata.outputs} + output_names = list(outputs) + outputs_req = [InferRequestedOutput(name) for name in outputs] + + for ids, x, y_real in self._dataloader: + infer_inputs = [] + for name in inputs: + data = x[name] + infer_input = InferInput(name, data.shape, inputs[name].datatype) + + target_np_dtype = client_utils.triton_to_np_dtype(inputs[name].datatype) + data = data.astype(target_np_dtype) + + infer_input.set_data_from_numpy(data) + infer_inputs.append(infer_input) + + results = client.infer( + model_name=self._model_name, + model_version=self._model_version, + inputs=infer_inputs, + outputs=outputs_req, + client_timeout=self._response_wait_t, + ) + y_pred = {name: results.as_numpy(name) for name in output_names} + yield ids, x, y_pred, y_real + + def _verify_triton_state(self, triton_client): + if not triton_client.is_server_live(): + return f"Triton server {self._server_url} is not live" + elif not triton_client.is_server_ready(): + return f"Triton server {self._server_url} is not ready" + elif not triton_client.is_model_ready(self._model_name, self._model_version): + return f"Model {self._model_name}:{self._model_version} is not ready" + return None + + +class AsyncGRPCTritonRunner: + DEFAULT_MAX_RESP_WAIT_S = 120 + DEFAULT_MAX_UNRESP_REQS = 128 + DEFAULT_MAX_FINISH_WAIT_S = 900 # 15min + + def __init__( + self, + server_url: str, + model_name: str, + model_version: str, + *, + dataloader, + verbose=False, + resp_wait_s: Optional[float] = None, + max_unresponded_reqs: Optional[int] = None, + ): + self._server_url = server_url + self._model_name = model_name + self._model_version = model_version + self._dataloader = dataloader + self._verbose = verbose + self._response_wait_t = self.DEFAULT_MAX_RESP_WAIT_S if resp_wait_s is None else resp_wait_s + self._max_unresp_reqs = self.DEFAULT_MAX_UNRESP_REQS if max_unresponded_reqs is None else max_unresponded_reqs + + self._results = queue.Queue() + self._processed_all = False + self._errors = [] + self._num_waiting_for = 0 + self._sync = threading.Condition() + self._req_thread = threading.Thread(target=self.req_loop, daemon=True) + + def __iter__(self): + self._req_thread.start() + timeout_s = 0.050 # check flags processed_all and error flags every 50ms + while True: + try: + ids, x, y_pred, y_real = self._results.get(timeout=timeout_s) + yield ids, x, y_pred, y_real + except queue.Empty: + shall_stop = self._processed_all or self._errors + if shall_stop: + break + + LOGGER.debug("Waiting for request thread to stop") + self._req_thread.join() + if self._errors: + error_msg = "\n".join(map(str, self._errors)) + raise RuntimeError(error_msg) + + def _on_result(self, ids, x, y_real, output_names, result, error): + with self._sync: + request_id = str(ids[0]) + NOT_MATCHING_REQUEST_ID_MSG = ( + "Error during processing result - request_id doesn't match. This shouldn't have happened." 
+ ) + if error: + response_id = error.get_response().id + if response_id != request_id: + raise RuntimeError(NOT_MATCHING_REQUEST_ID_MSG) + self._errors.append(error) + else: + response_id = result.get_response().id + if response_id != request_id: + raise RuntimeError(NOT_MATCHING_REQUEST_ID_MSG) + y_pred = {name: result.as_numpy(name) for name in output_names} + self._results.put((ids, x, y_pred, y_real)) + self._num_waiting_for -= 1 + self._sync.notify_all() + + def req_loop(self): + client = InferenceServerClient(self._server_url, verbose=self._verbose) + self._errors = self._verify_triton_state(client) + if self._errors: + return + + LOGGER.debug( + f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!" + ) + + model_config = client.get_model_config(self._model_name, self._model_version) + model_metadata = client.get_model_metadata(self._model_name, self._model_version) + LOGGER.info(f"Model config {model_config}") + LOGGER.info(f"Model metadata {model_metadata}") + + inputs = {tm.name: tm for tm in model_metadata.inputs} + outputs = {tm.name: tm for tm in model_metadata.outputs} + output_names = list(outputs) + + self._num_waiting_for = 0 + + for ids, x, y_real in self._dataloader: + infer_inputs = [] + for name in inputs: + data = x[name] + infer_input = InferInput(name, data.shape, inputs[name].datatype) + + target_np_dtype = client_utils.triton_to_np_dtype(inputs[name].datatype) + data = data.astype(target_np_dtype) + + infer_input.set_data_from_numpy(data) + infer_inputs.append(infer_input) + + outputs_req = [InferRequestedOutput(name) for name in outputs] + + with self._sync: + + def _check_can_send(): + return self._num_waiting_for < self._max_unresp_reqs + + can_send = self._sync.wait_for(_check_can_send, timeout=self._response_wait_t) + if not can_send: + error_msg = f"Runner could not send new requests for {self._response_wait_t}s" + self._errors.append(error_msg) + self._sync.notify_all() + break + + request_id = str(ids[0]) + callback = functools.partial(AsyncGRPCTritonRunner._on_result, self, ids, x, y_real, output_names) + client.async_infer( + model_name=self._model_name, + model_version=self._model_version, + inputs=infer_inputs, + outputs=outputs_req, + callback=callback, + request_id=request_id, + ) + self._num_waiting_for += 1 + self._sync.notify_all() + + # wait till receive all requested data + with self._sync: + + def _all_processed(): + LOGGER.debug(f"wait for {self._num_waiting_for} unprocessed jobs") + return self._num_waiting_for == 0 + + self._processed_all = self._sync.wait_for(_all_processed, self.DEFAULT_MAX_FINISH_WAIT_S) + if not self._processed_all: + error_msg = f"Runner {self._response_wait_t}s timeout received while waiting for results from server" + self._errors.append(error_msg) + + self._sync.notify_all() + + LOGGER.debug("Finished request thread") + + def _verify_triton_state(self, triton_client): + errors = [] + if not triton_client.is_server_live(): + errors.append(f"Triton server {self._server_url} is not live") + elif not triton_client.is_server_ready(): + errors.append(f"Triton server {self._server_url} is not ready") + elif not triton_client.is_model_ready(self._model_name, self._model_version): + errors.append(f"Model {self._model_name}:{self._model_version} is not ready") + return errors + + +def _parse_args(): + parser = argparse.ArgumentParser(description="Infer model on Triton server", allow_abbrev=False) + parser.add_argument( + "--server-url", type=str, default="localhost:8001", 
help="Inference server URL (default localhost:8001)" + ) + parser.add_argument("--model-name", help="The name of the model used for inference.", required=True) + parser.add_argument("--model-version", help="The version of the model used for inference.", required=True) + parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True) + parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False) + parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False) + parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=True) + parser.add_argument("--output-dir", required=True, help="Path to directory where outputs will be saved") + parser.add_argument( + "--response-wait-time", required=False, help="Maximal time to wait for response", default=120, type=float + ) + parser.add_argument( + "--max-unresponded-requests", + required=False, + help="Maximal number of unresponded requests", + default=128, + type=int, + ) + parser.add_argument( + "--synchronous", help="Enable synchronous calls to Triton Server", action="store_true", default=False + ) + + args, *_ = parser.parse_known_args() + + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + ArgParserGenerator(get_dataloader_fn).update_argparser(parser) + args = parser.parse_args() + + return args + + +def main(): + args = _parse_args() + + log_format = "%(asctime)s %(levelname)s %(name)s %(message)s" + log_level = logging.INFO if not args.verbose else logging.DEBUG + logging.basicConfig(level=log_level, format=log_format) + + LOGGER.info("args:") + for key, value in vars(args).items(): + LOGGER.info(f" {key} = {value}") + + get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME) + dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args) + + try: + if args.synchronous: + runner = SyncGRPCTritonRunner( + args.server_url, + args.model_name, + args.model_version, + dataloader=dataloader_fn(), + verbose=False, + resp_wait_s=args.response_wait_time, + ) + else: + runner = AsyncGRPCTritonRunner( + args.server_url, + args.model_name, + args.model_version, + dataloader=dataloader_fn(), + verbose=False, + resp_wait_s=args.response_wait_time, + max_unresponded_reqs=args.max_unresponded_requests, + ) + + except Exception as e: + message = traceback.format_exc() + LOGGER.error(f"Encountered exception \n{message}") + raise e + + with JsonDumpWriter(output_dir=args.output_dir) as writer: + start = time.time() + for ids, x, y_pred, y_real in tqdm(runner, unit="batch", mininterval=10): + data = _verify_and_format_dump(args, ids, x, y_pred, y_real) + writer.write(**data) + stop = time.time() + + LOGGER.info(f"\nThe inference took {stop - start:0.3f}s") + + +def _verify_and_format_dump(args, ids, x, y_pred, y_real): + data = {"outputs": y_pred, "ids": {"ids": ids}} + if args.dump_inputs: + data["inputs"] = x + if args.dump_labels: + if not y_real: + raise ValueError( + "Found empty label values. 
+            raise ValueError(
+                "Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
+            )
+        data["labels"] = y_real
+    return data
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_performance_on_triton.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_performance_on_triton.py
new file mode 100755
index 00000000..ca9c7146
--- /dev/null
+++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/triton/run_performance_on_triton.py
@@ -0,0 +1,608 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import csv
+import logging
+import os
+import pathlib
+import shutil
+from distutils.version import LooseVersion
+from enum import Enum
+from importlib.metadata import version
+from typing import Any, Dict, List
+
+import yaml
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from .deployment_toolkit.core import BatchingMode, EvaluationMode, MeasurementMode, OfflineMode, PerformanceTool
+from .deployment_toolkit.model_analyzer import ModelAnalyzer, ModelAnalyzerConfig, ModelAnalyzerMode
+from .deployment_toolkit.perf_analyzer import PerfAnalyzer, PerfAnalyzerConfig
+from .deployment_toolkit.report import save_results, show_results, sort_results
+from .deployment_toolkit.utils import parse_server_url
+from .deployment_toolkit.warmup import performance_evaluation_warmup
+
+LOGGER = logging.getLogger("run_performance_on_triton")
+
+TRITON_CLIENT_VERSION = LooseVersion(version("tritonclient"))
+
+
+def _log_dict(title: str, dict_: Dict[str, Any]):
+    LOGGER.info(title)
+    for key, value in dict_.items():
+        LOGGER.info(f"\t{key} = {value}")
+
+
+def _calculate_average_latency(r):
+    avg_sum_fields = [
+        "Client Send",
+        "Network+Server Send/Recv",
+        "Server Queue",
+        "Server Compute",
+        "Server Compute Input",
+        "Server Compute Infer",
+        "Server Compute Output",
+        "Client Recv",
+    ]
+    avg_latency = sum([int(r.get(f, 0)) for f in avg_sum_fields])
+
+    return avg_latency
+
+
+def _update_performance_data(results: List, batch_size: int, performance_partial_file: str):
+    row: Dict = {"Batch": batch_size}
+    with open(performance_partial_file, "r") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for r in reader:
+            avg_latency = _calculate_average_latency(r)
+            row = {**row, **r, "avg latency": avg_latency}
+            results.append(row)
+
+
+def _model_analyzer_evaluation(
+    server_url: str,
+    model_name: str,
+    input_data: str,
+    input_shapes: List[str],
+    batch_sizes: List[int],
+    number_of_triton_instances: int,
+    number_of_model_instances: int,
+    measurement_mode: MeasurementMode,
+    measurement_interval: int,
+    measurement_request_count: int,
+    concurrency_steps: int,
+    batching_mode: BatchingMode,
+    evaluation_mode: EvaluationMode,
+    offline_mode: OfflineMode,
+    model_repository: str,
+    result_path: str,
+    verbose: bool,
+):
+    _log_dict(
+        "Selected configuration",
+        {
+ "server_url": server_url, + "model_name": model_name, + "input_data": input_data, + "input_shapes": input_shapes, + "batch_sizes": batch_sizes, + "number_of_triton_instances": number_of_triton_instances, + "number_of_model_instances": number_of_model_instances, + "measurement_mode": measurement_mode, + "measurement_interval": measurement_interval, + "measurement_request_count": measurement_request_count, + "concurrency_steps": concurrency_steps, + "batching_mode": batching_mode, + "evaluation_mode": evaluation_mode, + "offline_mode": offline_mode, + "model_repository": model_repository, + "result_path": result_path, + "verbose": verbose, + }, + ) + + perf_analyzer_config = { + "input-data": input_data, + "measurement-interval": measurement_interval, + "verbose": verbose + } + + if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"): + perf_analyzer_config["measurement-mode"] = measurement_mode.value + perf_analyzer_config["measurement-request-count"] = measurement_request_count + + if evaluation_mode == EvaluationMode.OFFLINE: + perf_analyzer_config["shared-memory"] = offline_mode.value + + for shape in input_shapes: + perf_analyzer_config["shape"] = shape + LOGGER.warning("Model Analyzer support only single shape param for Perf Analyzer.") + break + + if batching_mode == BatchingMode.STATIC: + batch_sizes = batch_sizes + concurrency = [number_of_triton_instances] + elif batching_mode == BatchingMode.DYNAMIC: + max_batch_size = max(batch_sizes) + max_total_requests = 2 * max_batch_size * number_of_triton_instances * number_of_model_instances + max_concurrency = min(256, max_total_requests) + step = max(1, max_concurrency // concurrency_steps) + min_concurrency = step + + concurrency = {"start": min_concurrency, "stop": max_concurrency, "step": step} + batch_sizes = [max(1, max_total_requests // 256)] + else: + raise ValueError(f"Unsupported batching mode: {batching_mode}") + + protocol, host, port = parse_server_url(server_url) + + checkpoints = pathlib.Path("./checkpoints") + if checkpoints.is_dir(): + shutil.rmtree(checkpoints.as_posix()) + + checkpoints.mkdir(parents=True, exist_ok=True) + + config = { + "model_repository": model_repository, + "triton_launch_mode": "remote", + "run_config_search_disable": True, + "perf_analyzer_flags": perf_analyzer_config, + "perf_analyzer_timeout": 3600, # Workaround for Perf Analyzer timeout - use 1h + "profile_models": [model_name], + "batch_sizes": batch_sizes, + "concurrency": concurrency, + "verbose": verbose, + "checkpoint_directory": checkpoints.as_posix(), + "override_output_model_repository": True, + "client_protocol": protocol, + f"triton_{protocol}_endpoint": f"{host}:{port}", + } + + if verbose: + _log_dict("Model Analyzer profiling configuration", config) + + with open("config.yaml", "w") as file: + yaml.safe_dump(config, file) + + config = ModelAnalyzerConfig() + model_analyzer = ModelAnalyzer(config=config) + model_analyzer.run(mode=ModelAnalyzerMode.PROFILE, verbose=verbose) + + result_path = pathlib.Path(result_path) + result_path.mkdir(parents=True, exist_ok=True) + + for file in checkpoints.iterdir(): + if not file.is_file() or file.suffix != ".ckpt": + continue + + LOGGER.info(f"Moving checkpoint {file.name} to {result_path}") + shutil.move(file, result_path / file.name) + + inference_output_fields = [ + "batch_size", + "concurrency", + "perf_throughput", + "perf_latency", + "perf_client_send_recv", + "perf_client_response_wait", + "perf_server_queue", + "perf_server_compute_input", + "perf_server_compute_infer", + 
"perf_server_compute_output", + ] + gpu_output_fields = [ + "gpu_uuid", + "batch_size", + "concurrency", + "gpu_used_memory", + "gpu_free_memory", + "gpu_utilization", + "gpu_power_usage", + ] + + filename_model_inference = "metrics-model-inference.csv" + filename_model_gpu = "metrics-model-gpu.csv" + + config = { + "analysis_models": model_name, + "checkpoint_directory": result_path.as_posix(), + "export_path": "/tmp", + "inference_output_fields": inference_output_fields, + "gpu_output_fields": gpu_output_fields, + "filename_model_inference": filename_model_inference, + "filename_model_gpu": filename_model_gpu, + "summarize": False, + } + + if verbose: + _log_dict("Model Analyzer analysis configuration", config) + + with open("config.yaml", "w") as file: + yaml.safe_dump(config, file) + + config = ModelAnalyzerConfig() + + model_analyzer = ModelAnalyzer(config=config) + model_analyzer.run(mode=ModelAnalyzerMode.ANALYZE, verbose=verbose) + + inference_metrics_file = pathlib.Path("/tmp") / "results" / filename_model_inference + gpu_metrics_file = pathlib.Path("/tmp") / "results" / filename_model_gpu + + for file in [inference_metrics_file, gpu_metrics_file]: + LOGGER.info(f"Moving metrics {file.name} to {result_path}") + shutil.move(file, result_path / file.name) + + +def _perf_analyzer_evaluation( + server_url: str, + model_name: str, + input_data: str, + input_shapes: List[str], + batch_sizes: List[int], + number_of_triton_instances: int, + number_of_model_instances: int, + measurement_mode: MeasurementMode, + measurement_interval: int, + measurement_request_count: int, + concurrency_steps: int, + batching_mode: BatchingMode, + evaluation_mode: EvaluationMode, + offline_mode: OfflineMode, + result_path: str, + verbose: bool, +): + protocol, host, port = parse_server_url(server_url) + + if batching_mode == BatchingMode.STATIC: + batch_sizes = batch_sizes + max_concurrency = 1 + min_concurrency = 1 + step = 1 + elif batching_mode == BatchingMode.DYNAMIC: + max_batch_size = max(batch_sizes) + max_total_requests = 2 * max_batch_size * number_of_triton_instances * number_of_model_instances + max_concurrency = min(256, max_total_requests) + step = max(1, max_concurrency // concurrency_steps) + min_concurrency = step + batch_sizes = [max(1, max_total_requests // 256)] + else: + raise ValueError(f"Unsupported batching mode: {batching_mode}") + + _log_dict( + "Selected configuration", + { + "server_url": server_url, + "model_name": model_name, + "input_data": input_data, + "input_shapes": input_shapes, + "batch_sizes": batch_sizes, + "number_of_triton_instances": number_of_triton_instances, + "number_of_model_instances": number_of_model_instances, + "measurement_mode": measurement_mode, + "measurement_interval": measurement_interval, + "measurement_request_count": measurement_request_count, + "concurrency_steps": concurrency_steps, + "batching_mode": batching_mode, + "evaluation_mode": evaluation_mode, + "offline_mode": offline_mode, + "result_path": result_path, + "verbose": verbose, + }, + ) + + results: List[Dict] = list() + for batch_size in batch_sizes: + for concurrency in range(min_concurrency, max_concurrency + step, step): + performance_partial_file = ( + f"triton_performance_{evaluation_mode.value.lower()}_{batching_mode.value.lower()}_partial_{batch_size}_{concurrency}.csv" + ) + + params = { + "model-name": model_name, + "model-version": 1, + "batch-size": batch_size, + "url": f"{host}:{port}", + "protocol": protocol, + "input-data": input_data, + "measurement-interval": 
+                "measurement-interval": measurement_interval,
+                "concurrency-range": f"{concurrency}:{concurrency}:1",
+                "latency-report-file": performance_partial_file,
+            }
+
+            if verbose:
+                params["extra-verbose"] = True
+            else:
+                params["verbose"] = True
+
+            if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"):
+                params["measurement-mode"] = measurement_mode.value
+                params["measurement-request-count"] = measurement_request_count
+
+            if evaluation_mode == EvaluationMode.OFFLINE:
+                params["shared-memory"] = offline_mode.value
+
+            if verbose:
+                _log_dict(f"Perf Analyzer config for batch_size: {batch_size} and concurrency: {concurrency}", params)
+
+            config = PerfAnalyzerConfig()
+            for param, value in params.items():
+                config[param] = value
+
+            for shape in input_shapes:
+                config["shape"] = shape
+
+            # One perf_analyzer run per (batch size, concurrency) pair; each run writes a
+            # partial CSV which is merged into the final report and then removed.
+            perf_analyzer = PerfAnalyzer(config=config)
+            perf_analyzer.run()
+            _update_performance_data(results, batch_size, performance_partial_file)
+            os.remove(performance_partial_file)
+
+    results = sort_results(results=results)
+
+    save_results(filename=result_path, data=results)
+    show_results(results=results)
+
+
+def _run_performance_analysis(
+    server_url: str,
+    model_name: str,
+    input_data: str,
+    input_shapes: List[str],
+    batch_sizes: List[int],
+    number_of_triton_instances: int,
+    number_of_model_instances: int,
+    measurement_mode: MeasurementMode,
+    measurement_interval: int,
+    measurement_request_count: int,
+    concurrency_steps: int,
+    batching_mode: BatchingMode,
+    evaluation_mode: EvaluationMode,
+    offline_mode: OfflineMode,
+    performance_tool: PerformanceTool,
+    model_repository: str,
+    result_path: str,
+    warmup: bool,
+    verbose: bool,
+):
+    log_level = logging.INFO if not verbose else logging.DEBUG
+    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
+    logging.basicConfig(level=log_level, format=log_format)
+
+    if warmup:
+        LOGGER.info("Running warmup before the main test")
+        performance_evaluation_warmup(
+            server_url=server_url,
+            model_name=model_name,
+            input_data=input_data,
+            input_shapes=input_shapes,
+            batch_sizes=batch_sizes,
+            number_of_triton_instances=number_of_triton_instances,
+            number_of_model_instances=number_of_model_instances,
+            measurement_mode=measurement_mode,
+            measurement_interval=measurement_interval,
+            measurement_request_count=measurement_request_count,
+            batching_mode=batching_mode,
+            evaluation_mode=evaluation_mode,
+            offline_mode=offline_mode,
+        )
+
+    if performance_tool == PerformanceTool.MODEL_ANALYZER:
+        LOGGER.info("Using Model Analyzer for performance evaluation")
+        _model_analyzer_evaluation(
+            server_url=server_url,
+            model_name=model_name,
+            input_data=input_data,
+            input_shapes=input_shapes,
+            batch_sizes=batch_sizes,
+            number_of_triton_instances=number_of_triton_instances,
+            number_of_model_instances=number_of_model_instances,
+            measurement_mode=measurement_mode,
+            measurement_interval=measurement_interval,
+            measurement_request_count=measurement_request_count,
+            concurrency_steps=concurrency_steps,
+            batching_mode=batching_mode,
+            evaluation_mode=evaluation_mode,
+            offline_mode=offline_mode,
+            model_repository=model_repository,
+            result_path=result_path,
+            verbose=verbose,
+        )
+    elif performance_tool == PerformanceTool.PERF_ANALYZER:
+        LOGGER.info("Using Perf Analyzer for performance evaluation")
+        _perf_analyzer_evaluation(
+            server_url=server_url,
+            model_name=model_name,
+            input_data=input_data,
+            input_shapes=input_shapes,
+            batch_sizes=batch_sizes,
+            number_of_triton_instances=number_of_triton_instances,
+            number_of_model_instances=number_of_model_instances,
+            measurement_mode=measurement_mode,
+            measurement_interval=measurement_interval,
+            measurement_request_count=measurement_request_count,
+            concurrency_steps=concurrency_steps,
+            batching_mode=batching_mode,
+            evaluation_mode=evaluation_mode,
+            offline_mode=offline_mode,
+            result_path=result_path,
+            verbose=verbose,
+        )
+    else:
+        raise ValueError(f"Unsupported performance tool {performance_tool}")
+
+
+# Note: this local definition shadows the MeasurementMode imported from .deployment_toolkit.core.
+class MeasurementMode(Enum):
+    """
+    Available measurement stabilization modes
+    """
+
+    COUNT_WINDOWS = "count_windows"
+    TIME_WINDOWS = "time_windows"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--server-url",
+        type=str,
+        required=False,
+        default="grpc://127.0.0.1:8001",
+        help="URL of the Triton server",
+    )
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        required=True,
+        help="Name of the model to test",
+    )
+    parser.add_argument(
+        "--input-data",
+        type=str,
+        required=False,
+        default="random",
+        help="Input data to perform profiling.",
+    )
+    parser.add_argument(
+        "--input-shapes",
+        action="append",
+        required=False,
+        help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=str,
+        required=True,
+        help="List of batch sizes to test. Comma separated.",
+    )
+    parser.add_argument(
+        "--number-of-triton-instances",
+        type=int,
+        default=1,
+        help="Number of Triton Server instances",
+    )
+    parser.add_argument(
+        "--number-of-model-instances",
+        type=int,
+        default=1,
+        help="Number of model instances on Triton Server",
+    )
+    parser.add_argument(
+        "--measurement-mode",
+        choices=[item.value for item in MeasurementMode],
+        default=MeasurementMode.COUNT_WINDOWS.value,
+        type=str,
+        help="Select measurement mode. "
+        "'time_windows' stabilizes the measurement over a time window. "
+        "'count_windows' stabilizes the measurement over a number of requests.",
+    )
+    parser.add_argument(
+        "--measurement-interval",
+        required=False,
+        help="Time window (in ms) perf_analyzer uses to stabilize the measurement",
+        default=5000,
+        type=int,
+    )
+    parser.add_argument(
+        "--measurement-request-count",
+        required=False,
+        help="Number of requests perf_analyzer uses to stabilize the measurement",
+        default=50,
+        type=int,
+    )
+    parser.add_argument(
+        "--concurrency-steps",
+        help="Number of concurrency steps used for dynamic batching tests",
+        default=32,
+        type=int,
+    )
+    parser.add_argument(
+        "--batching-mode",
+        choices=[item.value for item in BatchingMode],
+        default=BatchingMode.STATIC.value,
+        type=str,
+        help="Select batching mode. "
+        "'static' runs the static batching scenario. "
+        "'dynamic' runs the dynamic batching scenario.",
+    )
+    parser.add_argument(
+        "--evaluation-mode",
+        choices=[item.value for item in EvaluationMode],
+        default=EvaluationMode.OFFLINE.value,
+        type=str,
+        help="Select evaluation mode. "
+        "'offline' runs offline analysis and uses GPU memory to pass tensors. "
+        "'online' runs online analysis and uses the HTTP protocol.",
+    )
+    parser.add_argument(
+        "--offline-mode",
+        choices=[item.value for item in OfflineMode],
+        default=OfflineMode.SYSTEM.value,
+        type=str,
+        help="Select offline mode. "
+        "'system' passes tensors through CPU RAM. "
+        "'cuda' passes tensors through GPU memory.",
+    )
+    parser.add_argument(
+        "--performance-tool",
+        choices=[item.value for item in PerformanceTool],
+        default=PerformanceTool.MODEL_ANALYZER.value,
+        type=str,
+        help="Select performance tool for the measurement. "
+        "'model_analyzer' uses Model Analyzer. "
+        "'perf_analyzer' uses Perf Analyzer.",
+    )
+    parser.add_argument(
+        "--model-repository",
+        default=None,
+        type=str,
+        help="Path to the model repository. Used only with Model Analyzer",
+    )
+    parser.add_argument("--result-path", type=str, required=True, help="Path where result files are stored.")
+    parser.add_argument(
+        "--warmup", help="Enable model warmup before performance test", action="store_true", default=False
+    )
+    parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+    batch_sizes = [int(batch_size) for batch_size in args.batch_sizes.split(",")]
+    _run_performance_analysis(
+        server_url=args.server_url,
+        model_name=args.model_name,
+        input_data=args.input_data,
+        input_shapes=args.input_shapes or [],
+        batch_sizes=batch_sizes,
+        number_of_triton_instances=args.number_of_triton_instances,
+        number_of_model_instances=args.number_of_model_instances,
+        measurement_mode=MeasurementMode(args.measurement_mode),
+        measurement_interval=args.measurement_interval,
+        measurement_request_count=args.measurement_request_count,
+        concurrency_steps=args.concurrency_steps,
+        batching_mode=BatchingMode(args.batching_mode),
+        evaluation_mode=EvaluationMode(args.evaluation_mode),
+        offline_mode=OfflineMode(args.offline_mode),
+        performance_tool=PerformanceTool(args.performance_tool),
+        model_repository=args.model_repository,
+        result_path=args.result_path,
+        warmup=args.warmup,
+        verbose=args.verbose,
+    )
+
+
+if __name__ == "__main__":
+    main()