diff --git a/PyTorch/Forecasting/TFT/LICENCE b/PyTorch/Forecasting/TFT/LICENSE similarity index 99% rename from PyTorch/Forecasting/TFT/LICENCE rename to PyTorch/Forecasting/TFT/LICENSE index 261eeb9e..7f9708a7 100644 --- a/PyTorch/Forecasting/TFT/LICENCE +++ b/PyTorch/Forecasting/TFT/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2021 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT
deleted file mode 100644
index 5d1d88cf..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/LICENSE AGREEMENT
+++ /dev/null
@@ -1,25 +0,0 @@
-Individual Contributor License Agreement (CLA)
-Thank you for submitting your contributions to this project.
-
-By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project.
-
-License.
-You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement.
-
-This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore, you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions.
-
-Moral Rights.
-To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project.
-
-Third Party Content.
-If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project.
-
-Representations.
-You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions.
-
-Disclaimer.
-To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support.
-
-No Obligation.
-You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates.
-
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE
deleted file mode 100644
index ae19bb47..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/NOTICE
+++ /dev/null
@@ -1,3 +0,0 @@
-TFT for PyTorch
-
-This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md
deleted file mode 100644
index 69b39d12..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/README.md
+++ /dev/null
@@ -1,465 +0,0 @@
-# Temporal Fusion Transformer For PyTorch
-
-This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA.
-
-## Table Of Contents
-
-- [Model overview](#model-overview)
-  * [Model architecture](#model-architecture)
-  * [Default configuration](#default-configuration)
-  * [Feature support matrix](#feature-support-matrix)
-    * [Features](#features)
-  * [Mixed precision training](#mixed-precision-training)
-    * [Enabling mixed precision](#enabling-mixed-precision)
-    * [Enabling TF32](#enabling-tf32)
-  * [Glossary](#glossary)
-- [Setup](#setup)
-  * [Requirements](#requirements)
-- [Quick Start Guide](#quick-start-guide)
-- [Advanced](#advanced)
-  * [Scripts and sample code](#scripts-and-sample-code)
-  * [Command-line options](#command-line-options)
-  * [Getting the data](#getting-the-data)
-    * [Dataset guidelines](#dataset-guidelines)
-    * [Multi-dataset](#multi-dataset)
-  * [Training process](#training-process)
-  * [Inference process](#inference-process)
-- [Performance](#performance)
-  * [Benchmarking](#benchmarking)
-    * [Training performance benchmark](#training-performance-benchmark)
-    * [Inference performance benchmark](#inference-performance-benchmark)
-  * [Results](#results)
-    * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
-      * [Training stability test](#training-stability-test)
-    * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
-- [Release notes](#release-notes)
-  * [Changelog](#changelog)
-  * [Known issues](#known-issues)
-
-
-
-## Model overview
-
-The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
-The model predicts confidence intervals for future values of a time series over multiple future timesteps.
-
-This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
-### Model architecture
-
-The TFT model is a hybrid architecture joining LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these covariates, the model is fed historical values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector for each of them. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as an initial state of the LSTM encoders.
-After encoding, variables are passed to multi-head attention layers (the decoder), which produce the final prediction. The whole architecture is interwoven with gated residual connections that allow it to adapt to various problems by skipping some of its parts.
-For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
-![](TFT_architecture.PNG)
-*image source: https://arxiv.org/abs/1912.09363*
-
-### Default configuration
-
-The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss QL(y, ŷ, q) = q·max(y − ŷ, 0) + (1 − q)·max(ŷ − y, 0), summed over the quantiles q ∈ {0.1, 0.5, 0.9}. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
-
-| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-
-### Feature support matrix
-
-The following features are supported by this model:
-
-| Feature | Supported |
-|----------------------------|--------------------------|
-| Distributed data parallel | Yes |
-| PyTorch AMP | Yes |
-
-
-#### Features
-
-[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html)
-provides an easy way to leverage Tensor Cores' performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
-
-[PyTorch
-DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module
-wrapper that enables easy multiprocess distributed data-parallel
-training.
-
-### Mixed precision training
-
-Mixed precision is the combined use of different numerical precisions in a
-computational method.
-[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
-computational speedup by performing operations in half-precision format while
-storing minimal information in single-precision to retain as much information
-as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta architecture, and following with
-both the Turing and Ampere architectures, significant training speedups can be
-achieved by switching to
-mixed precision -- up to 3x overall speedup on the most arithmetically intense
-model architectures. Using mixed precision training previously required two
-steps:
-
-1. Porting the model to use the FP16 data type where appropriate.
-2. Manually adding loss scaling to preserve small gradient values.
-
-The ability to train deep learning networks with lower precision was introduced
-in the Pascal architecture and first supported in [CUDA
-8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
-Learning SDK.
-
-For information about:
-* How to train using mixed precision, refer to the [Mixed Precision
-  Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
-  Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
-  documentation.
-* Techniques used for mixed precision training, refer to the [Mixed-Precision
-  Training of Deep Neural
-  Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
-  blog.
-* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in
-  PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/)
-  blog.
-
-
-#### Enabling mixed precision
-
-Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision `torch.cuda.amp` module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the `GradScaler` class.
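For illustration, the canonical `torch.cuda.amp` pattern looks like the sketch below. This is a generic PyTorch example, not the TFT training loop itself; the tiny model, loss, and random data are stand-ins:

```python
import torch
from torch.cuda.amp import autocast, GradScaler

model = torch.nn.Linear(16, 1).cuda()          # stand-in for the TFT model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()                 # stand-in for the quantile loss
scaler = GradScaler()                          # handles dynamic loss scaling

for _ in range(10):
    batch = torch.randn(32, 16, device='cuda')
    target = torch.randn(32, 1, device='cuda')
    optimizer.zero_grad()
    with autocast():                           # forward pass runs in mixed precision
        loss = criterion(model(batch), target)
    scaler.scale(loss).backward()              # scale the loss to preserve small gradients
    scaler.step(optimizer)                     # unscale gradients, then apply the update
    scaler.update()                            # adjust the scale factor for the next step
```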
-All the necessary steps to implement AMP are described in detail in the [AMP examples](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples) documentation.
-
-To enable mixed precision for TFT, simply add the `--use_amp` option to the training script.
-#### Enabling TF32
-
-TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
-
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require a high dynamic range for weights or activations.
-
-For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
-
-TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
-
-
-
-### Glossary
-
-**Multi horizon prediction**
-The process of estimating values of a time series for multiple future time steps.
-
-**Quantiles**
-Cut points dividing the range of a probability distribution into intervals with equal probabilities.
-
-**Time series**
-A series of data points indexed and equally spaced in time.
-
-**Transformer**
-The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called the Transformer, which uses an attention mechanism to transform one sequence into another.
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to start training the TFT model.
-
-### Requirements
-
-This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-- Supported GPUs:
-  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
-  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
-
-For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
-
-
-If you cannot use the PyTorch NGC container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
-
-## Quick Start Guide
-
-To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
-
-1. Clone the repository.
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/Forecasting/TFT
-```
-
-2. Build the TFT PyTorch NGC container.
-```bash
-docker build --network=host -t tft .
-```
-
-3. Start an interactive session in the NGC container to run training/inference.
-```bash
-docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
-```
-
-Note: Be sure to mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
-
-4. Download and preprocess the datasets.
-```bash
-bash scripts/get_data.sh
-```
-
-5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
-These scripts are tuned for DGX-1-32G. If you have a different system, use the `NGPU` and `BATCH_SIZE` variables to adjust the parameters for your system.
-```bash
-bash scripts/run_electricity.sh
-bash scripts/run_traffic.sh
-```
-
-6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
-```
-
-7. Start inference/predictions. Visualize and save the predictions by running the following command.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
---visualize \
---save_predictions
-```
-
-
-Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance results](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
-## Advanced
-
-The following sections provide more details about the dataset, running training and inference, and the training results.
-
-### Scripts and sample code
-
-In the root directory, the most important files are:
-
-* `train.py`: Entry point for training
-* `data_utils.py`: File containing the dataset implementation and preprocessing functions
-* `modeling.py`: Definition of the model
-* `configuration.py`: Contains configuration classes for various experiments
-* `test.py`: Entry point for testing a trained model
-* `Dockerfile`: Container definition
-* `log_helper.py`: Contains helper functions for setting up dllogger
-* `criterions.py`: Definitions of loss functions
-
-The `scripts` directory contains scripts for the default use cases:
-* `run_electricity.sh`: Trains the default model on the electricity dataset
-* `run_traffic.sh`: Trains the default model on the traffic dataset
-
-### Command-line options
-
-To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
-`python train.py --help`.
-
-The following example output is printed when running the model:
-```
-usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
-                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data_path DATA_PATH
-  --dataset {electricity,volatility,traffic,favorita}
-  --epochs EPOCHS
-  --sample_data SAMPLE_DATA SAMPLE_DATA
-  --batch_size BATCH_SIZE
-  --lr LR
-  --seed SEED
-  --use_amp             Enable automatic mixed precision
-  --clip_grad CLIP_GRAD
-  --early_stopping EARLY_STOPPING
-                        Stop training if validation loss does not improve for more than this number of epochs.
-  --results RESULTS
-  --log_file LOG_FILE
-  --distributed_world_size N
-                        total number of GPUs across all nodes (default: all visible GPUs)
-  --distributed_rank DISTRIBUTED_RANK
-                        rank of the current worker
-  --local_rank LOCAL_RANK
-                        rank of the current worker
-  --overwrite_config OVERWRITE_CONFIG
-                        JSON string used to overload config
-
-```
-
-### Getting the data
-
-The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for the electricity and traffic datasets automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers.
-#### Dataset guidelines
-
-The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into the common format which contains the features we will use for training. The standardized data is then cleaned, normalized, encoded, and binarized.
-This step does the following:
-* Drops all the columns that are not marked in the configuration file as used for training or preprocessing
-* Flattens indices in case time series are indexed by more than one column
-* Splits the data into training, validation, and test splits
-* Filters out all the time series shorter than the minimal example length
-* Normalizes columns marked as continuous in the configuration file
-* Encodes as integers columns marked as categorical
-* Saves the data in CSV and binary formats
-
-#### Multi-dataset
-In order to use an alternate dataset, you have to write a function that parses your data into a common format. The format is as follows:
-* There is at least one id column
-* There is exactly one time column (that can also be used as a feature column)
-* Each feature is in a separate column
-* Each row represents a moment in time for only one time series
-
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file; a sketch of such a parsing function is shown below.
-### Training process
-
-The `train.py` script is the entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends when the validation loss has not improved for N consecutive epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
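As an illustration of the common format described in the [Multi-dataset](#multi-dataset) section above, a parsing function for a hypothetical raw dataset could look like the sketch below. It is not part of the repository, and the raw column names (`sensor`, `timestamp`, `reading`) are invented:

```python
import pandas as pd

def parse_my_dataset(path):
    """Convert a hypothetical raw CSV into the common format:
    at least one id column, exactly one time column, one feature
    per column, and one row per (time series, moment in time)."""
    raw = pd.read_csv(path)
    df = pd.DataFrame({
        'id': raw['sensor'].astype(str),              # entity identifier
        'hours_from_start': raw['timestamp'],         # the single time column
        'reading': raw['reading'].astype('float32'),  # target feature
    })
    # Each row must describe one moment of exactly one time series
    return df.sort_values(['id', 'hours_from_start']).reset_index(drop=True)
```

A matching configuration class, modeled on those in `configuration.py`, would then declare `reading` as the target and `hours_from_start` as the time index.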
-You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss defined in the [Default configuration](#default-configuration) section. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric, i.e., the normalized quantile loss q-risk(q) = 2·Σ QL(y, ŷ, q) / Σ|y|, where the sums run over all forecasted points.
-Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in a dictionary-per-line format), and TensorBoard logs.
-
-### Inference process
-
-Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running the inference. Example command:
-```
-python inference.py \
---checkpoint /results/checkpoint.pt \
---data /data/processed/electricity_bin/test.csv \
---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
---batch_size 2048 \
---visualize \
---save_predictions \
---joint_visualization \
---results /results \
---use_amp
-```
-
-In the default setting, it evaluates the model on the specified dataset and prints the q-risk computed on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in CSV format. The `--joint_visualization` option plots graphs in TensorBoard format, allowing us to inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file.
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in training and inference modes.
-
-#### Training performance benchmark
-
-In order to run training benchmarks, use the `scripts/benchmark.sh` script.
-
-#### Inference performance benchmark
-
-To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
-### Results
-
-The following sections provide details on how we achieved our performance and accuracy in training and inference.
-
-#### Training accuracy results
-
-We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs. Accuracy is reported as the test q-risk for the 0.1 / 0.5 / 0.9 quantiles (lower is better).
-
-##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x
-| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x
-| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x
-
-
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x
-| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x
-
-
-##### Training stability test
-
-To get a fuller picture of the model's accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1
-| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1
-| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
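As a sanity check on the table above, the weak-scaling columns are simply the ratio of 8-GPU to 1-GPU throughput (an illustrative computation, not repository code):

```python
# TF32 Electricity throughput from the table above, in items per second
throughput_1gpu = 10173
throughput_8gpu = 80596
print(f"weak scaling: {throughput_8gpu / throughput_1gpu:.2f}x")  # -> 7.92x
```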
-
-
-##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|------|----
-| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1
-| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-## Release notes
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achievable with NVIDIA's latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-There are no known issues with this model.
-
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/TFT_architecture.PNG and /dev/null differ
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py
deleted file mode 100644
index bef26e66..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/configuration.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
It could probably be written more cleanly.
-        self.grouped = [
-                [
-                    arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]])
-                    for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map)
-                ]
-                for arr in self.grouped
-               ]
-
-    def __len__(self):
-        return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0
-
-    def __getitem__(self, idx):
-        g_idx = bisect(self._cum_examples_in_group, idx)
-        e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx
-
-        group = self.grouped[g_idx]
-
-        tensors = [
-                torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length])
-                if feat.size else torch.empty(0)
-                for feat in group
-                ]
-
-        return OrderedDict(zip(FEAT_NAMES, tensors))
-
-
-class TFTDataset(Dataset):
-    def __init__(self, path, config):
-        super().__init__()
-        self.features = config.features
-        self.data = pd.read_csv(path, index_col=0)
-        self.example_length = config.example_length
-        self.stride = config.dataset_stride
-
-        # The name field is a column name. There can be multiple entries with the
-        # same name because one column can be interpreted in many ways.
-        time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME)
-        id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID)
-        if id_col_name not in self.data.columns:
-            id_col_name = DEFAULT_ID_COL
-            self.features = [x for x in self.features if x.feature_type!=InputTypes.ID]
-            self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL))
-        col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features}
-
-        self.data.sort_values(time_col_name, inplace=True)
-        self.data = self.data[set(x.name for x in self.features)] # keep only the relevant columns
-        self.data = self.data.astype(col_dtypes)
-        self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length)
-        self.grouped = list(self.data.groupby(id_col_name))
-
-        self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped])
-
-    def __len__(self):
-        return self._cum_examples_in_group[-1]
-
-    def __getitem__(self, idx):
-        g_idx = len([x for x in self._cum_examples_in_group if x <= idx])
-        e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx
-
-        group = self.grouped[g_idx][1]
-        sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length]
-
-        # We need to be sure that tensors are returned in the correct order
-        tensors = tuple([] for _ in range(8))
-        for v in self.features:
-            if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL:
-                tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS:
-                tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL:
-                tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS:
-                tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL:
-                tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS:
-                tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.TARGET:
-                tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.ID:
-                tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy()))
-
-        tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors]
-
-        return OrderedDict(zip(FEAT_NAMES, tensors))
-
-def get_dataset_splits(df, config):
-
-    if hasattr(config, 'relative_split') and config.relative_split:
-        forecast_len = config.example_length - config.encoder_length
-        # The valid split is shifted forward from the train split by the number of forecast steps.
-        # The test split is shifted by a further number of forecast steps beyond the valid split.
-        train = []
-        valid = []
-        test = []
-
-        for _, group in df.groupby(DEFAULT_ID_COL):
-            index = group[config.time_ids]
-            _train = group.loc[index < config.valid_boundary]
-            _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)]
-            _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)]
-            train.append(_train)
-            valid.append(_valid)
-            test.append(_test)
-
-        train = pd.concat(train, axis=0)
-        valid = pd.concat(valid, axis=0)
-        test = pd.concat(test, axis=0)
-    else:
-        index = df[config.time_ids]
-        train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])]
-        valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])]
-        test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])]
-
-    return train, valid, test
-
-def flatten_ids(df, config):
-
-    if config.missing_id_strategy == 'drop':
-        if hasattr(config, 'combine_ids') and config.combine_ids:
-            index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids])
-        else:
-            id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-            index = df[id_col].isna()
-        index = index[index].index # extract the indices of rows with missing ids
-        df.drop(index, inplace=True)
-
-    if not (hasattr(config, 'combine_ids') and config.combine_ids):
-        id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-        ids = df[id_col].apply(str)
-        df.drop(id_col, axis=1, inplace=True)
-        encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values)
-        df[DEFAULT_ID_COL] = encoder.transform(ids)
-        encoders = OrderedDict({id_col: encoder})
-
-    else:
-        encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids}
-        encoders = OrderedDict(encoders)
-        lens = [len(v.classes_) for v in encoders.values()]
-        clens = np.roll(np.cumprod(lens), 1)
-        clens[0] = 1
-
-        # This is very slow. It would probably be faster to create two dummy columns instead.
-        df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1)
-        df.drop(config.combine_ids, axis=1, inplace=True)
-
-    return DEFAULT_ID_COL, encoders
-
-def impute(df, config):
-    # XXX This ensures that our scaling will have the same mean. We still need to check the variance.
-    if not hasattr(config, 'missing_data_label'):
-        return df, None
-    else:
-        imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean')
-        mask = df.applymap(lambda x: x == config.missing_data_label)
-        data = df.values
-        col_mask = (data == config.missing_data_label).all(axis=0)
-        data[:,~col_mask] = imp.fit_transform(data)
-        return data, mask
-
-def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL):
-    tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET]
-    real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols)))
-    real_scalers = {}
-    tgt_scalers = {}
-
-    def apply_scalers(df, name=None):
-        if name is None:
-            name = df.name
-        mask = df.applymap(lambda x: x == config.missing_data_label) if hasattr(config, 'missing_data_label') else None
-        df[real_cols] = real_scalers[name].transform(df[real_cols])
-        if mask is not None and any(mask):
-            df[real_cols].mask(mask, 10**9)
-        df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols])
-        return df
-
-    if config.scale_per_id:
-        for identifier, sliced in train.groupby(id_col):
-            data = sliced[real_cols]
-            data, _ = impute(data, config)
-            real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
-            # XXX We should probably remove examples that contain NaN as a target
-            target = sliced[tgt_cols]
-            tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target)
-
-        train = train.groupby(id_col).apply(apply_scalers)
-        # For the valid and test splits, keep only timeseries already present in the train subset.
-        # XXX Ideally, unseen timeseries would be encoded as a special case rather than thrown away.
-        valid = valid.loc[valid[id_col].isin(real_scalers.keys())]
-        valid = valid.groupby(id_col).apply(apply_scalers)
-        test = test.loc[test[id_col].isin(real_scalers.keys())]
-        test = test.groupby(id_col).apply(apply_scalers)
-
-    else:
-        data, _ = impute(train[real_cols], config)
-        real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data)
-        tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols])
-
-        train = apply_scalers(train, name='')
-        valid = apply_scalers(valid, name='')
-        test = apply_scalers(test, name='')
-
-    return train, valid, test, real_scalers, tgt_scalers
-
-def encode_categoricals(train, valid, test, config):
-    cat_encodings = {}
-    cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID))
-    num_classes = [] # XXX Maybe we should modify the config based on this value, or at least emit a warning?
-    # For Tensor Core performance we might want num_classes[i] to be divisible by 8.
-
-    # Train categorical encoders
-    for c in cat_cols:
-        if config.missing_cat_data_strategy == 'special_token':
-            # XXX this will probably require some data augmentation
-            unique = train[c].unique()
-            # Replace categories unseen during training with a special empty token
-            valid.loc[~valid[c].isin(unique), c] = ''
-            test.loc[~test[c].isin(unique), c] = ''
-
-        if config.missing_cat_data_strategy == 'encode_all' or \
-           config.missing_cat_data_strategy == 'special_token':
-            srs = pd.concat([train[c], valid[c], test[c]]).apply(str)
-            cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
-        elif config.missing_cat_data_strategy == 'drop':
-            # TODO: implement this. In addition to dropping rows, this has to split the affected
-            # time series into chunks so that the data has no temporal gaps.
-            pass
-        num_classes.append(srs.nunique())
-    print('Categorical variable encoding lengths:', num_classes)
-
-    for split in [train, valid, test]:
-        for c in cat_cols:
-            srs = split[c].apply(str)
-            split[c] = srs
-            split.loc[:,c] = cat_encodings[c].transform(srs)
-
-    return cat_encodings
-
-
-def preprocess(src_path, dst_path, config):
-    df = pd.read_csv(src_path, index_col=0)
-
-    for c in config.features:
-        if c.feature_embed_type == DataTypes.DATE:
-            df[c.name] = pd.to_datetime(df[c.name])
-
-    # Keep only the columns relevant to preprocessing
-    relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
-    df = df[relevant_columns]
-
-    id_col, id_encoders = flatten_ids(df, config)
-    df = df.reindex(sorted(df.columns), axis=1)
-
-    train, valid, test = get_dataset_splits(df, config)
-
-    # Length-filter the data (all timeseries shorter than the example length are dropped)
-    #for df in [train, valid, test]:
-    #    df.groupby(id_col).filter(lambda x: len(x) >= config.example_length)
-    train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
-    valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
-    test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])
-
-    train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)
-
-    cat_encodings = encode_categoricals(train, valid, test, config)
-
-    os.makedirs(dst_path, exist_ok=True)
-
-    train.to_csv(os.path.join(dst_path, 'train.csv'))
-    valid.to_csv(os.path.join(dst_path, 'valid.csv'))
-    test.to_csv(os.path.join(dst_path, 'test.csv'))
-
-    # Save the relevant columns in binary form for faster dataloading.
-    # IMPORTANT: We always expect id to be a single column identifying the complete timeseries.
-    # We also expect a copy of id in the form of a static categorical input!
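A note on the binary format written just below: every column is cast to float32 and then bit-reinterpreted as int32 for storage, and TFTBinaryDataset reverses the trick with .view(dtype=np.float32). A standalone sketch (toy arrays, not repository code) of why the round trip is lossless as long as categorical codes stay below 2**24:

import numpy as np

codes = np.array([0, 1, 7, 123456], dtype=np.int64)      # categorical codes
reals = np.array([0.5, -1.25, 3.0, 1000.0])              # continuous values
table = np.stack([codes.astype(np.float64), reals], axis=1)

stored = table.astype(np.float32).view(dtype=np.int32)   # what preprocess() pickles
recovered = stored.view(dtype=np.float32)                # what TFTBinaryDataset loads

assert np.array_equal(recovered[:, 0].astype(np.int64), codes)  # exact below 2**24
assert np.allclose(recovered[:, 1], reals)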
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
-
-            tmp = pd.DataFrame({'power_usage': srs})
-            date = tmp.index
-            tmp['t'] = (date - earliest_time).seconds / 60 / 60 + (
-                date - earliest_time).days * 24
-            tmp['days_from_start'] = (date - earliest_time).days
-            tmp['categorical_id'] = label
-            tmp['date'] = date
-            tmp['id'] = label
-            tmp['hour'] = date.hour
-            tmp['day'] = date.day
-            tmp['day_of_week'] = date.dayofweek
-            tmp['month'] = date.month
-
-            df_list.append(tmp)
-
-        output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)
-
-        output['categorical_id'] = output['id'].copy()
-        output['hours_from_start'] = output['t']
-        output['categorical_day_of_week'] = output['day_of_week'].copy()
-        output['categorical_hour'] = output['hour'].copy()
-
-        output.to_csv(os.path.join(path, 'standarized.csv'))
-
-def standarize_volatility(path):
-    df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0)  # no explicit index
-
-    # Adds additional date/day fields
-    idx = [str(s).split('+')[0] for s in df.index]  # ignore timezones, we don't need them
-    dates = pd.to_datetime(idx)
-    df['date'] = dates
-    df['days_from_start'] = (dates - datetime.datetime(2000, 1, 3)).days
-    df['day_of_week'] = dates.dayofweek
-    df['day_of_month'] = dates.day
-    df['week_of_year'] = dates.weekofyear
-    df['month'] = dates.month
-    df['year'] = dates.year
-    df['categorical_id'] = df['Symbol'].copy()
-
-    # Processes log volatility
-    vol = df['rv5_ss'].copy()
-    vol.loc[vol == 0.] = np.nan
-    df['log_vol'] = np.log(vol)
-
-    # Adds static information
-    symbol_region_mapping = {
-        '.AEX': 'EMEA',
-        '.AORD': 'APAC',
-        '.BFX': 'EMEA',
-        '.BSESN': 'APAC',
-        '.BVLG': 'EMEA',
-        '.BVSP': 'AMER',
-        '.DJI': 'AMER',
-        '.FCHI': 'EMEA',
-        '.FTMIB': 'EMEA',
-        '.FTSE': 'EMEA',
-        '.GDAXI': 'EMEA',
-        '.GSPTSE': 'AMER',
-        '.HSI': 'APAC',
-        '.IBEX': 'EMEA',
-        '.IXIC': 'AMER',
-        '.KS11': 'APAC',
-        '.KSE': 'APAC',
-        '.MXX': 'AMER',
-        '.N225': 'APAC',
-        '.NSEI': 'APAC',
-        '.OMXC20': 'EMEA',
-        '.OMXHPI': 'EMEA',
-        '.OMXSPI': 'EMEA',
-        '.OSEAX': 'EMEA',
-        '.RUT': 'EMEA',
-        '.SMSI': 'EMEA',
-        '.SPX': 'AMER',
-        '.SSEC': 'APAC',
-        '.SSMI': 'EMEA',
-        '.STI': 'APAC',
-        '.STOXX50E': 'EMEA'
-    }
-
-    df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k])
-
-    # Performs final processing
-    output_df_list = []
-    for grp in df.groupby('Symbol'):
-        sliced = grp[1].copy()
-        sliced.sort_values('days_from_start', inplace=True)
-        # Impute log volatility values
-        sliced['log_vol'].fillna(method='ffill', inplace=True)
-        sliced = sliced.dropna()  # assign the result; a bare dropna() is a no-op
-        output_df_list.append(sliced)
-
-    df = pd.concat(output_df_list, axis=0)
-
-    df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-def standarize_traffic(path):
-    def process_list(s, variable_type=int, delimiter=None):
-        """Parses a line in the PEMS format to a list."""
-        if delimiter is None:
-            l = [
-                variable_type(i) for i in s.replace('[', '').replace(']', '').split()
-            ]
-        else:
-            l = [
-                variable_type(i)
-                for i in s.replace('[', '').replace(']', '').split(delimiter)
-            ]
-
-        return l
-
-    def read_single_list(filename):
-        """Returns a single list from a file in the PEMS-custom format."""
-        with open(os.path.join(path, filename), 'r') as dat:
-            l = process_list(dat.readlines()[0])
-        return l
-
-    def read_matrix(filename):
-        """Returns a matrix from a file in the PEMS-custom format."""
-        array_list = []
-        with open(os.path.join(path, filename), 'r') as dat:
-            lines = dat.readlines()
-            for i, line in enumerate(lines):
-                if (i + 1) % 50 == 0:
-                    print('Completed {} of {} rows for {}'.format(i + 1, len(lines),
-                                                                  filename))
-                array = [
-                    process_list(row_split, variable_type=float, delimiter=None)
-                    for row_split in process_list(
-                        line, variable_type=str, delimiter=';')
-                ]
-                array_list.append(array)
-
-        return array_list
-
-    shuffle_order = np.array(read_single_list('randperm')) - 1  # index from 0
-    train_dayofweek = read_single_list('PEMS_trainlabels')
-    train_tensor = read_matrix('PEMS_train')
-    test_dayofweek = read_single_list('PEMS_testlabels')
-    test_tensor = read_matrix('PEMS_test')
-
-    # Invert the shuffle order
-    print('Shuffling')
-    inverse_mapping = {
-        new_location: previous_location
-        for previous_location, new_location in enumerate(shuffle_order)
-    }
-    reverse_shuffle_order = np.array([
-        inverse_mapping[new_location]
-        for new_location, _ in enumerate(shuffle_order)
-    ])
-
-    # Group and reorder based on the permutation
-    print('Reordering')
-    day_of_week = np.array(train_dayofweek + test_dayofweek)
-    combined_tensor = np.array(train_tensor + test_tensor)
-
-    day_of_week = day_of_week[reverse_shuffle_order]
-    combined_tensor = combined_tensor[reverse_shuffle_order]
-
-    # Put everything back into a dataframe
-    print('Parsing as dataframe')
-    labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]
-
-    hourly_list = []
-    for day, day_matrix in enumerate(combined_tensor):
-        # Hourly data
-        hourly = pd.DataFrame(day_matrix.T, columns=labels)
-        hourly['hour_on_day'] = [int(i / 6) for i in hourly.index]  # sampled at 10 min intervals
-        if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
-            raise ValueError('Invalid hour! {}-{}'.format(
-                hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))
-
-        hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
-        hourly['sensor_day'] = day
-        hourly['time_on_day'] = hourly.index
-        hourly['day_of_week'] = day_of_week[day]
-
-        hourly_list.append(hourly)
-
-    hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)
-
-    # Flatten such that each entity uses one row in the dataframe
-    store_columns = [c for c in hourly_frame.columns if 'traj' in c]
-    other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
-    flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] +
-                           other_columns + ['id'])
-
-    for store in store_columns:
-        print('Processing {}'.format(store))
-
-        sliced = hourly_frame[[store] + other_columns].copy()
-        sliced.columns = ['values'] + other_columns
-        sliced['id'] = int(store.replace('traj_', ''))
-
-        # Sort by sensor-date-time
-        key = sliced['id'].apply(str) \
-            + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
-            + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
-        sliced = sliced.set_index(key).sort_index()
-
-        sliced['values'] = sliced['values'].fillna(method='ffill')
-        sliced['prev_values'] = sliced['values'].shift(1)
-        sliced['next_values'] = sliced['values'].shift(-1)
-
-        flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)
-
-    # Filter to match the range used by other academic papers
-    index = flat_df['sensor_day']
-    flat_df = flat_df[index < 173].copy()
-
-    # Create columns for categorical inputs
-    flat_df['categorical_id'] = flat_df['id'].copy()
-    flat_df['hours_from_start'] = flat_df['time_on_day'] \
-        + flat_df['sensor_day']*24.
-    flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
-    flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()
-
-    flat_df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-# XXX needs rework
-def standarize_favorita(data_folder):
-    import gc
-    # Extract only a subset of data to save/process for efficiency
-    start_date = datetime.datetime(2015, 1, 1)
-    end_date = datetime.datetime(2016, 6, 1)
-
-    print('Regenerating data...')
-
-    # Load temporal data
-    temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)
-
-    store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
-    oil = pd.read_csv(
-        os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
-    holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
-    items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
-    transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))
-
-    # Restrict to the configured date range
-    temporal['date'] = pd.to_datetime(temporal['date'])
-
-    # Filter dates to reduce storage space requirements
-    if start_date is not None:
-        temporal = temporal[(temporal['date'] >= start_date)]
-    if end_date is not None:
-        temporal = temporal[(temporal['date'] < end_date)]
-
-    dates = temporal['date'].unique()
-
-    # Add trajectory identifier
-    temporal['traj_id'] = temporal['store_nbr'].apply(
-        str) + '_' + temporal['item_nbr'].apply(str)
-    temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply(
-        str)
-
-    # Remove all IDs with negative returns
-    print('Removing returns data')
-    min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
-    valid_ids = set(min_returns[min_returns >= 0].index)
-    selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
-    new_temporal = temporal[selector].copy()
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-    temporal['open'] = 1
-
-    # Resampling
-    print('Resampling to regular grid')
-    resampled_dfs = []
-    for traj_id, raw_sub_df in temporal.groupby('traj_id'):
-        print('Resampling', traj_id)
-        sub_df = raw_sub_df.set_index('date', drop=True).copy()
-        sub_df = sub_df.resample('1d').last()
-        sub_df['date'] = sub_df.index
-        sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
-            = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
-        sub_df['open'] = sub_df['open'].fillna(
-            0)  # flag where sales data is unknown
-        sub_df['log_sales'] = np.log(sub_df['unit_sales'])
-
-        resampled_dfs.append(sub_df.reset_index(drop=True))
-
-    new_temporal = pd.concat(resampled_dfs, axis=0)
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-
-    print('Adding oil')
-    oil.name = 'oil'
-    oil.index = pd.to_datetime(oil.index)
-    # XXX The lines below match the oil price on a given date with the rest of the timeseries.
-    # Missing values in the oil series are copied from the preceding index, and the oil series
-    # is then joined with temporal. For dates present in temporal but absent from oil, the oil
-    # value is substituted with -1. Why?
-    # TODO: check how many NaNs there are after the first step. Previously the oil series was
-    # extended with the dates in the `dates` variable (as NaNs), which were then forward filled.
-    # This behavior is no longer supported by pandas, so we switched to the DataFrame.isin method.
-    # This leaves us with more NaNs after the first step than previously. To achieve the previous
-    # behavior we would have to join the series before filling NaNs.
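In miniature, the join-then-fill behaviour the comment above describes (toy data, not from the dataset):

import pandas as pd

oil = pd.Series([50.0, None, 52.0], name='oil',
                index=pd.to_datetime(['2015-01-01', '2015-01-02', '2015-01-03']))
temporal = pd.DataFrame({'date': pd.to_datetime(['2015-01-01', '2015-01-02', '2015-01-04'])})

temporal = temporal.join(oil, on='date', how='left')
# 01-02 has a missing price, 01-04 is absent from oil entirely; both become NaN here
temporal['oil'] = temporal['oil'].fillna(method='ffill')  # both inherit 50.0, the last joined value
temporal['oil'] = temporal['oil'].fillna(-1)              # only dates before any known price remain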
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
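A quick sanity check on the decay rule in ModelEma.update above: starting from zeros and feeding a constant value v, n updates leave the average at v * (1 - decay**n). A standalone sketch (decay value arbitrary):

import torch

decay = 0.999
v = torch.ones(3)
ema = torch.zeros(3)
for _ in range(10):
    ema = decay * ema + (1.0 - decay) * v   # the update rule used by ModelEma
assert torch.allclose(ema, v * (1 - decay ** 10))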
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = 
torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)})
-
-            if args.joint_visualization:
-                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
-                for q, t in _g.items(): # target and quantiles, time-horizon values
-                    if q == 'targets':
-                        targets = torch.cat([t[:,0], t[-1,1:]]) # WIP
-                        # We want to plot targets on the same graph as predictions. This could probably be written better.
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons which are shifted in phase.
-                    # The next lines realign them.
-                    y = t.new_full((t.shape[0] + t.shape[1] - 1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y): # timestep, time-horizon values; v == v filters out NaNs
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up the model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-    model.cuda()
-
-    # Set up the dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum': sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    print('Test q-risk: P10 {test_p10} | P50 {test_p50} | P90 {test_p90}'.format(**quantiles))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on a separate plot. Projections will be concatenated.')
-    parser.add_argument('--save_predictions', action='store_true')
-    parser.add_argument('--results', type=str, default='/results')
-    parser.add_argument('--log_file', type=str, default='dllogger.json')
-    ARGS = parser.parse_args()
-    main(ARGS)
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py
deleted file mode 100644
index 83d2ac7f..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/log_helper.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
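The TensorBoardBackend defined below flattens list-valued hyperparameters into scalar entries before handing them to add_hparams. In miniature, with made-up values (assuming the class-filtering pass runs over the accumulated parameters):

data = {'lr': 1e-3, 'quantiles': [0.1, 0.5, 0.9], 'results': None}
parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
for d in [{f'{k}_{i}': v for i, v in enumerate(l)}
          for k, l in data.items() if isinstance(l, (list, tuple))]:
    parameters.update(d)                   # 'quantiles' becomes quantiles_0..quantiles_2
parameters = {k: v for k, v in parameters.items()
              if isinstance(v, (int, float, str, bool))}
parameters.update({k: 'None' for k, v in data.items() if v is None})
assert parameters == {'lr': 1e-3, 'quantiles_0': 0.1, 'quantiles_1': 0.5,
                      'quantiles_2': 0.9, 'results': 'None'}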
-
-import os
-import subprocess
-import sys
-import itertools
-import atexit
-
-import dllogger
-from dllogger import Backend, JSONStreamBackend, StdOutBackend
-
-import torch.distributed as dist
-from torch.utils.tensorboard import SummaryWriter
-
-class TensorBoardBackend(Backend):
-    def __init__(self, verbosity, log_dir):
-        super().__init__(verbosity=verbosity)
-        self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'),
-                                            flush_secs=120,
-                                            max_queue=200
-                                            )
-        self.hp_cache = None
-        atexit.register(self.summary_writer.close)
-
-    @property
-    def log_level(self):
-        return self._log_level
-
-    def metadata(self, timestamp, elapsedtime, metric, metadata):
-        pass
-
-    def log(self, timestamp, elapsedtime, step, data):
-        if step == 'HPARAMS':
-            parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
-            # Unpack lists and tuples
-            for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]:
-                parameters.update(d)
-            # Remove custom classes (filter the accumulated parameters, not the raw data)
-            parameters = {k: v for k, v in parameters.items() if isinstance(v, (int, float, str, bool))}
-            parameters.update({k:'None' for k, v in data.items() if v is None})
-            self.hp_cache = parameters
-        if step == ():
-            if self.hp_cache is None:
-                print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr)
-                return
-            self.summary_writer.add_hparams(self.hp_cache, data)
-        if not isinstance(step, int):
-            return
-        for k, v in data.items():
-            self.summary_writer.add_scalar(k, v, step)
-
-    def flush(self):
-        pass
-
-def setup_logger(args):
-    os.makedirs(args.results, exist_ok=True)
-    log_path = os.path.join(args.results, args.log_file)
-
-    if os.path.exists(log_path):
-        for i in itertools.count():
-            s_fname = args.log_file.split('.')
-            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
-    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git
a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. Temporal known a priori continuous - # 5. Temporal observed categorical - # 6. Temporal observed continuous - # 7. 
Temporal observed targets (the time series observed so far)
-
-        self.s_cat_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None
-        self.t_cat_k_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None
-        self.t_cat_o_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None
-
-        self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size))
-
-        self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size))
-
-        if self.s_cont_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors)
-        if self.t_cont_k_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors)
-        if self.t_cont_o_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors)
-        torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors)
-
-    def _apply_embedding(self,
-                         cat: Optional[Tensor],
-                         cont: Optional[Tensor],
-                         cat_emb: Optional[nn.ModuleList],
-                         cont_emb: Tensor,
-                         cont_bias: Tensor,
-                         ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
-        e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None
-        if cont is not None:
-            # The line below is equivalent to the following einsums:
-            # e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb)
-            # e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb)
-            e_cont = torch.mul(cont.unsqueeze(-1), cont_emb)
-            e_cont = e_cont + cont_bias
-        else:
-            e_cont = None
-
-        if e_cat is not None and e_cont is not None:
-            return torch.cat([e_cat, e_cont], dim=-2)
-        elif e_cat is not None:
-            return e_cat
-        elif e_cont is not None:
-            return e_cont
-        else:
-            return None
-
-    def forward(self, x: Dict[str, Tensor]):
-        # temporal/static categorical/continuous known/observed input
-        s_cat_inp = x.get('s_cat', None)
-        s_cont_inp = x.get('s_cont', None)
-        t_cat_k_inp = x.get('k_cat', None)
-        t_cont_k_inp = x.get('k_cont', None)
-        t_cat_o_inp = x.get('o_cat', None)
-        t_cont_o_inp = x.get('o_cont', None)
-        t_tgt_obs = x['target'] # has to be present
-
-        # Static inputs are expected to be equal for all timesteps.
-        # For memory efficiency there is no assert statement.
-        s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None
-        s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None
-
-        s_inp = self._apply_embedding(s_cat_inp,
-                                      s_cont_inp,
-                                      self.s_cat_embed,
-                                      self.s_cont_embedding_vectors,
-                                      self.s_cont_embedding_bias)
-        t_known_inp = self._apply_embedding(t_cat_k_inp,
-                                            t_cont_k_inp,
self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - # the line below performs batched matrix-vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_h context (LSTM initial hidden state) - # state_c context (LSTM initial cell state) - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) - attn_score.mul_(self.scale) - - if mask_future_timesteps: - attn_score = attn_score + self._mask - - attn_prob = 
F.softmax(attn_score, dim=3) - attn_prob = self.attn_dropout(attn_prob) - - # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) - attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) - m_attn_vec = torch.mean(attn_vec, dim=1) - out = self.out_proj(m_attn_vec) - out = self.out_dropout(out) - - return out, attn_vec - - - -class TemporalFusionTransformer(nn.Module): - """ - Implementation of https://arxiv.org/abs/1912.09363 - """ - def __init__(self, config): - super().__init__() - - if hasattr(config, 'model'): - config = config.model - - self.encoder_length = config.encoder_length # determines how far into the past we use data from - - self.embedding = TFTEmbedding(config) - self.static_encoder = StaticCovariateEncoder(config) - - self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) - self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) - self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - - - self.input_gate = GLU(config.hidden_size, config.hidden_size) - self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.enrichment_grn = GRN(config.hidden_size, - config.hidden_size, - context_hidden_size=config.hidden_size, - dropout=config.dropout) - self.attention = InterpretableMultiHeadAttention(config) - self.attention_gate = GLU(config.hidden_size, config.hidden_size) - self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.positionwise_grn = GRN(config.hidden_size, - config.hidden_size, - dropout=config.dropout) - - self.decoder_gate = GLU(config.hidden_size, config.hidden_size) - self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) - - def forward(self, x: Dict[str, Tensor]) -> Tensor: - s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) - - # Static context - cs, ce, ch, cc = self.static_encoder(s_inp) - ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) #lstm initial states - - # Temporal input - _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] - if t_observed_inp is not None: - _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) - - historical_inputs = torch.cat(_historical_inputs, dim=-2) - future_inputs = t_known_inp[:, self.encoder_length:] - - # Encoders - historical_features, _ = self.history_vsn(historical_inputs, cs) - history, state = self.history_encoder(historical_features, (ch, cc)) - future_features, _ = self.future_vsn(future_inputs, cs) - future, _ = self.future_encoder(future_features, state) - torch.cuda.synchronize() # this call gives perf boost for unknown reasons - - # skip connection - input_embedding = torch.cat([historical_features, future_features], dim=1) - temporal_features = torch.cat([history, future], dim=1) - temporal_features = self.input_gate(temporal_features) - temporal_features = temporal_features + input_embedding - temporal_features = self.input_gate_ln(temporal_features) - - # Static enrichment - enriched = self.enrichment_grn(temporal_features, c=ce) - - # Temporal self attention - x, _ = self.attention(enriched, mask_future_timesteps=True) - - # Don't compute historical quantiles - x = x[:, self.encoder_length:, :] - temporal_features = temporal_features[:, self.encoder_length:, :] - enriched = enriched[:, self.encoder_length:, :] - - x = self.attention_gate(x) - x = x + 
enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
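-# Download the raw electricity and traffic datasets, standardize them with the
-# helpers in data_utils.py, and preprocess them into binary training format
-# under ${DATAPATH}/processed.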
- -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
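-# Train TFT on the electricity dataset with AMP on ${NGPU} GPUs.
-# SEED, LR, NGPU, BATCH_SIZE, and EPOCHS may be overridden via environment variables.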
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
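-# Train TFT on the traffic dataset with AMP on ${NGPU} GPUs (DGX-1 16GB recipe).
-# SEED, LR, NGPU, BATCH_SIZE, and EPOCHS may be overridden via environment variables.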
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT deleted file mode 100644 index 5d1d88cf..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT +++ /dev/null @@ -1,25 +0,0 @@ -Individual Contributor License Agreement (CLA) -Thank you for submitting your contributions to this project. - -By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. - -License. -You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. - -This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore, you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions. - -Moral Rights. -To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. - -Third Party Content. -If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. - -Representations. -You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions. - -Disclaimer. -To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. - -No Obligation.
-You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE deleted file mode 100644 index ae19bb47..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/NOTICE +++ /dev/null @@ -1,3 +0,0 @@ -TFT for PyTorch - -This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md deleted file mode 100644 index 69b39d12..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/README.md +++ /dev/null @@ -1,465 +0,0 @@ -# Temporal Fusion Transformer For PyTorch - -This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. - -## Table Of Contents - -- [Model overview](#model-overview) - * [Model architecture](#model-architecture) - * [Default configuration](#default-configuration) - * [Feature support matrix](#feature-support-matrix) - * [Features](#features) - * [Mixed precision training](#mixed-precision-training) - * [Enabling mixed precision](#enabling-mixed-precision) - * [Enabling TF32](#enabling-tf32) - * [Glossary](#glossary) -- [Setup](#setup) - * [Requirements](#requirements) -- [Quick Start Guide](#quick-start-guide) -- [Advanced](#advanced) - * [Scripts and sample code](#scripts-and-sample-code) - * [Command-line options](#command-line-options) - * [Getting the data](#getting-the-data) - * [Dataset guidelines](#dataset-guidelines) - * [Multi-dataset](#multi-dataset) - * [Training process](#training-process) - * [Inference process](#inference-process) - * [Performance](#performance) - * [Benchmarking](#benchmarking) - * [Training performance benchmark](#training-performance-benchmark) - * [Inference performance benchmark](#inference-performance-benchmark) - * [Results](#results) - * [Training accuracy results](#training-accuracy-results) - * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) - * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) - * [Training stability test](#training-stability-test) - * [Training performance results](#training-performance-results) - * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) - * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) -- [Release notes](#release-notes) - * [Changelog](#changelog) - * [Known issues](#known-issues) - - - -## Model overview - -The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation by addressing the issue of missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space. -The model predicts confidence intervals for the future values of a time series over multiple future timesteps. - -This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. - -### Model architecture - -The TFT model is a hybrid architecture that joins LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the time series itself. All variables are embedded into a high-dimensional space by learning an embedding vector. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as the initial state of the LSTM encoders. -After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts. -For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction. -![](TFT_architecture.PNG) -*image source: https://arxiv.org/abs/1912.09363* - -### Default configuration - -The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss QL(y, ŷ, q) = max(q(y − ŷ), (q − 1)(y − ŷ)), summed over the quantiles q ∈ {0.1, 0.5, 0.9}. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
- - | Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | - | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | - | Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | - | Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | - -### Feature support matrix - -The following features are supported by this model: - -| Feature | Supported -|----------------------------|-------------------------- -|Distributed data parallel | Yes -|PyTorch AMP | Yes - - -#### Features - -[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) -provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. - -[PyTorch -DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module -wrapper that enables easy multiprocess distributed data-parallel -training. - -### Mixed precision training - -Mixed precision is the combined use of different numerical precisions in a -computational method. -[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant -computational speedup by performing operations in half-precision format while -storing minimal information in single-precision to retain as much information -as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and continuing with -both the Turing and Ampere architectures, significant training speedups are -experienced by switching to -mixed precision -- up to 3x overall speedup on the most arithmetically intense -model architectures. Using mixed precision training previously required two -steps: - -1. Porting the model to use the FP16 data type where appropriate. -2. Manually adding loss scaling to preserve small gradient values. - -The ability to train deep learning networks with lower precision was introduced -in the Pascal architecture and first supported in [CUDA -8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep -Learning SDK. - -For information about: -* How to train using mixed precision, refer to the [Mixed Precision - Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed - Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) - documentation. -* Techniques used for mixed precision training, refer to the [Mixed-Precision - Training of Deep Neural - Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) - blog. -* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in - PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) - . - - -#### Enabling mixed precision - - -Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class.
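-A minimal sketch of this pattern (for illustration only; `MyModel`, `loader`, and `criterion` are placeholder names, not part of this repository):
-```python
-import torch
-from torch.cuda import amp
-
-model = MyModel().cuda()           # placeholder model
-optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
-scaler = amp.GradScaler()          # performs dynamic loss scaling
-
-for x, y in loader:                # placeholder data loader
-    optimizer.zero_grad()
-    with amp.autocast():           # forward pass runs in mixed precision
-        loss = criterion(model(x), y)
-    scaler.scale(loss).backward()  # scale the loss to preserve small gradients
-    scaler.step(optimizer)         # unscales gradients, then steps the optimizer
-    scaler.update()                # adjust the scale factor for the next iteration
-```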
All the necessary steps to implement AMP are described in detail [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). - -To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. -#### Enabling TF32 - -TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. - -TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. - -For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. - -TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. - - - -### Glossary - -**Multi horizon prediction** -Process of estimating values of a time series for multiple future time steps. - -**Quantiles** -Cut points dividing the range of a probability distribution into intervals with equal probabilities. - -**Time series** -Series of data points indexed and equally spaced in time. - -**Transformer** -The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. - - -## Setup - -The following section lists the requirements that you need to meet in order to start training the TFT model. - -### Requirements - -This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: -- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) -- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -- Supported GPUs: -- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) -- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) -- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) - -For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: -- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) - [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) -- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) - - -For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). - -## Quick Start Guide - -To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. - -1. Clone the repository.
-```bash -git clone https://github.com/NVIDIA/DeepLearningExamples -cd DeepLearningExamples/PyTorch/Forecasting/TFT -``` - -2. Build the TFT PyTorch NGC container. -```bash -docker build --network=host -t tft . -``` - -3. Start an interactive session in the NGC container to run training/inference. -```bash -docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft -``` - -Note: Be sure to mount your dataset using the -v flag to make it available for training inside the NVIDIA Docker container. - -4. Download and preprocess datasets. -```bash -bash scripts/get_data.sh -``` - -5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. -These scripts are tuned for DGX1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system. -```bash -bash scripts/run_electricity.sh -bash scripts/run_traffic.sh -``` - -6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per-quantile in the Pareto sense or jointly as one number indicating accuracy. -```bash -python inference.py \ ---checkpoint <path to checkpoint> \ ---data /data/processed/<dataset>/test.csv \ ---cat_encodings /data/processed/<dataset>/cat_encodings.bin \ ---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin -``` - -7. Start inference/predictions. Visualize and save predictions by running the following command. -```bash -python inference.py \ ---checkpoint <path to checkpoint> \ ---data /data/processed/<dataset>/test.csv \ ---cat_encodings /data/processed/<dataset>/cat_encodings.bin \ ---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \ ---visualize \ ---save_predictions -``` - - - -Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. -## Advanced - -The following sections provide more details about the dataset, running training and inference, and the training results. - -### Scripts and sample code - -In the root directory, the most important files are: - -`train.py`: Entry point for training -`data_utils.py`: File containing the dataset implementation and preprocessing functions -`modeling.py`: Definition of the model -`configuration.py`: Contains configuration classes for various experiments -`test.py`: Entry point for testing a trained model -`Dockerfile`: Container definition -`log_helper.py`: Contains helper functions for setting up dllogger -`criterions.py`: Definitions of loss functions - -The `scripts` directory contains scripts for default use cases: -`run_electricity.sh`: trains the default model on the electricity dataset -`run_traffic.sh`: trains the default model on the traffic dataset - -### Command-line options - -To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: -`python train.py --help`.
- -The following example output is printed when running the model: -``` -usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] - [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] - -optional arguments: - -h, --help show this help message and exit - --data_path DATA_PATH - --dataset {electricity,volatility,traffic,favorita} - --epochs EPOCHS - --sample_data SAMPLE_DATA SAMPLE_DATA - --batch_size BATCH_SIZE - --lr LR - --seed SEED - --use_amp Enable automatic mixed precision - --clip_grad CLIP_GRAD - --early_stopping EARLY_STOPPING - Stop training if validation loss does not improve for more than this number of epochs. - --results RESULTS - --log_file LOG_FILE - --distributed_world_size N - total number of GPUs across all nodes (default: all visible GPUs) - --distributed_rank DISTRIBUTED_RANK - rank of the current worker - --local_rank LOCAL_RANK - rank of the current worker - --overwrite_config OVERWRITE_CONFIG - JSON string used to overload config - -``` - -### Getting the data - -The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which for the electricity and traffic datasets will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers. -#### Dataset guidelines - -The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into the common format, which contains the features we will use for training. The standardized data is then cleaned, normalized, encoded, and binarized. -This step does the following: -- Drop all the columns that are not marked in the configuration file as used for training or preprocessing -- Flatten indices in case time series are indexed by more than one column -- Split the data into training, validation and test splits -- Filter out all the time series shorter than the minimal example length -- Normalize columns marked as continuous in the configuration file -- Encode columns marked as categorical as integers -- Save the data in CSV and binary formats - -#### Multi-dataset -In order to use an alternate dataset, you have to write a function that parses your data into the common format. The format is as follows: -- There is at least one ID column -- There is exactly one time column (that can also be used as a feature column) -- Each feature is in a separate column -- Each row represents a moment in time for only one time series -Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file. -### Training process - -The `train.py` script is the entry point for the training procedure. Refined recipes can be found in the `scripts` directory. -The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends if the validation loss has not improved for N consecutive epochs. -The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
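-For illustration, individual configuration fields can be overridden with the `--overwrite_config` option, which accepts a JSON string. A hypothetical override (the keys `hidden_size` and `dropout` mirror attributes referenced in `modeling.py`, but the exact schema should be checked against `configuration.py`):
-```bash
-python train.py \
-  --dataset electricity \
-  --data_path /data/processed/electricity_bin \
-  --overwrite_config '{"hidden_size": 160, "dropout": 0.2}'
-```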
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. - -Example command: -``` -python -m torch.distributed.launch --nproc_per_node=8 train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=1024 \ - --sample 450000 50000 \ - --lr 1e-3 \ - --epochs 25 \ - --early_stopping 5 \ - --seed 1 \ - --use_amp \ - --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 -``` - -The model is trained by optimizing the quantile loss described above. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric (the sum of quantile losses over the test split, normalized by the total magnitude of the targets). -Results are stored by default in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs. - -### Inference process - -Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data before running inference. Example command: -``` -python inference.py \ ---checkpoint /results/checkpoint.pt \ ---data /data/processed/electricity_bin/test.csv \ ---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ ---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ ---batch_size 2048 \ ---visualize \ ---save_predictions \ ---joint_visualization \ ---results /results \ ---use_amp -``` - -In the default setting, it evaluates the model on the specified dataset and prints the q-risk for this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in the directory specified by the `--results` option in CSV format. The `--joint_visualization` option plots graphs in TensorBoard format, which lets us inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file. -## Performance - -### Benchmarking - -The following section shows how to run benchmarks measuring the model performance in training and inference modes. - -#### Training performance benchmark - -In order to run training benchmarks, use the `scripts/benchmark.sh` script. - -#### Inference performance benchmark - -To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. -### Results - -The following sections provide details on how we achieved our performance and accuracy in training and inference. - -#### Training accuracy results - -We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs. - -##### Training accuracy: NVIDIA DGX A100 (A100 80GB) - -Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
-
-Accuracy is reported as P10 / P50 / P90 test q-risk (lower is better).
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x
-| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x
-| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x
-
-##### Training accuracy: NVIDIA DGX-1 (V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x
-| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x
-
-##### Training stability test
-
-To get a fuller picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. For each benchmark dataset, we then chose the configuration with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|-----------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch. Weak scaling is the N-GPU throughput divided by the single-GPU throughput.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 to mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1
-| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1
-| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-##### Training performance: NVIDIA DGX-1 (V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch. Weak scaling is the N-GPU throughput divided by the single-GPU throughput.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|------|----
-| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1
-| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-## Release notes
-
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-
-There are no known issues with this model.
-
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG and /dev/null differ
diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py
deleted file mode 100644
index bef26e66..00000000
--- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/configuration.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
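-
-# Dataset configurations for the TFT model. Each class below describes one
-# dataset: its feature layout (a list of FeatureSpec entries), the dataset
-# split boundaries, and the model hyperparameters used for that dataset.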
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. - # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == 
InputTypes.TARGET:
-                tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.ID:
-                tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy()))
-
-
-        tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors]
-
-        return OrderedDict(zip(FEAT_NAMES, tensors))
-
-def get_dataset_splits(df, config):
-
-    if hasattr(config, 'relative_split') and config.relative_split:
-        forecast_len = config.example_length - config.encoder_length
-        # The valid split is shifted from the train split by the number of forecast steps into the future.
-        # The test split is shifted by the number of forecast steps from the valid split.
-        train = []
-        valid = []
-        test = []
-
-        for _, group in df.groupby(DEFAULT_ID_COL):
-            index = group[config.time_ids]
-            _train = group.loc[index < config.valid_boundary]
-            _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)]
-            _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)]
-            train.append(_train)
-            valid.append(_valid)
-            test.append(_test)
-
-        train = pd.concat(train, axis=0)
-        valid = pd.concat(valid, axis=0)
-        test = pd.concat(test, axis=0)
-    else:
-        index = df[config.time_ids]
-        train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])]
-        valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])]
-        test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])]
-
-    return train, valid, test
-
-def flatten_ids(df, config):
-
-    if config.missing_id_strategy == 'drop':
-        if hasattr(config, 'combine_ids') and config.combine_ids:
-            index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids])
-        else:
-            id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-            index = df[id_col].isna()
-        index = index[index == True].index  # extract the indices of NaNs
-        df.drop(index, inplace=True)
-
-    if not (hasattr(config, 'combine_ids') and config.combine_ids):
-        id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-        ids = df[id_col].apply(str)
-        df.drop(id_col, axis=1, inplace=True)
-        encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values)
-        df[DEFAULT_ID_COL] = encoder.transform(ids)
-        encoders = OrderedDict({id_col: encoder})
-
-    else:
-        encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids}
-        encoders = OrderedDict(encoders)
-        lens = [len(v.classes_) for v in encoders.values()]
-        clens = np.roll(np.cumprod(lens), 1)
-        clens[0] = 1
-
-        # This takes a very long time. It would probably be better to create 2 dummy columns.
-        df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1)
-        df.drop(config.combine_ids, axis=1, inplace=True)
-
-    return DEFAULT_ID_COL, encoders
-
-def impute(df, config):
-    #XXX This ensures that our scaling will have the same mean.
-    # We still need to check the variance.
-    if not hasattr(config, 'missing_data_label'):
-        return df, None
-    else:
-        imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean')
-        mask = df.applymap(lambda x: True if x == config.missing_data_label else False)
-        data = df.values
-        col_mask = (data == config.missing_data_label).all(axis=0)
-        data[:,~col_mask] = imp.fit_transform(data)
-        return data, mask
-
-def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL):
-    tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET]
-    real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols)))
-    real_scalers = {}
-    tgt_scalers = {}
-
-    def apply_scalers(df, name=None):
-        if name is None:
-            name = df.name
-        mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None
-        df[real_cols] = real_scalers[name].transform(df[real_cols])
-        if mask is not None and mask.any().any():
-            df[real_cols] = df[real_cols].mask(mask, 10**9)
-        df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols])
-        return df
-
-    if config.scale_per_id:
-        for identifier, sliced in train.groupby(id_col):
-            data = sliced[real_cols]
-            data, _ = impute(data, config)
-            real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
-            # XXX We should probably remove examples that contain NaN as a target
-            target = sliced[tgt_cols]
-            tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target)
-
-        train = train.groupby(id_col).apply(apply_scalers)
-        # For validation and test, keep only time series previously present in the train subset.
-        # XXX For proper data science we should consider encoding unseen time series as a special case, not throwing them away.
-        valid = valid.loc[valid[id_col].isin(real_scalers.keys())]
-        valid = valid.groupby(id_col).apply(apply_scalers)
-        test = test.loc[test[id_col].isin(real_scalers.keys())]
-        test = test.groupby(id_col).apply(apply_scalers)
-
-    else:
-        data, _ = impute(train[real_cols], config)
-        real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data)
-        tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols])
-
-        train = apply_scalers(train, name='')
-        valid = apply_scalers(valid, name='')
-        test = apply_scalers(test, name='')
-
-    return train, valid, test, real_scalers, tgt_scalers
-
-def encode_categoricals(train, valid, test, config):
-    cat_encodings = {}
-    cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID))
-    num_classes = []  #XXX Maybe we should modify the config based on this value? Or emit a warning?
-    # For Tensor Core performance reasons we might want num_classes[i] to be divisible by 8.
-
-    # Train categorical encoders
-    for c in cat_cols:
-        if config.missing_cat_data_strategy == 'special_token':
-            #XXX this will probably require some data augmentation
-            unique = train[c].unique()
-            # Replace categories unseen in the train split with a special token
-            valid[c].loc[~valid[c].isin(unique)] = ''
-            test[c].loc[~test[c].isin(unique)] = ''
-
-        if config.missing_cat_data_strategy == 'encode_all' or \
-           config.missing_cat_data_strategy == 'special_token':
-            srs = pd.concat([train[c], valid[c], test[c]]).apply(str)
-            cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
-        elif config.missing_cat_data_strategy == 'drop':
-            # TODO: implement this.
-            # In addition to dropping rows, this has to split the affected time series into chunks
-            # to prevent the data from having temporal gaps.
-            pass
-        num_classes.append(srs.nunique())
-    print('Categorical variable encoding lengths: ', num_classes)
-
-
-    for split in [train, valid, test]:
-        for c in cat_cols:
-            srs = split[c].apply(str)
-            split[c] = srs
-            split.loc[:,c] = cat_encodings[c].transform(srs)
-
-    return cat_encodings
-
-
-def preprocess(src_path, dst_path, config):
-    df = pd.read_csv(src_path, index_col=0)
-
-    for c in config.features:
-        if c.feature_embed_type == DataTypes.DATE:
-            df[c.name] = pd.to_datetime(df[c.name])
-
-    # Leave only columns relevant to preprocessing
-    relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
-    df = df[relevant_columns]
-
-
-    id_col, id_encoders = flatten_ids(df, config)
-    df = df.reindex(sorted(df.columns), axis=1)
-
-    train, valid, test = get_dataset_splits(df, config)
-
-    # Length-filter the data (all time series shorter than the example length will be dropped)
-    #for df in [train, valid, test]:
-    #    df.groupby(id_col).filter(lambda x: len(x) >= config.example_length)
-    train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
-    valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
-    test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])
-
-    train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)
-
-    cat_encodings = encode_categoricals(train, valid, test, config)
-
-    os.makedirs(dst_path, exist_ok=True)
-
-    train.to_csv(os.path.join(dst_path, 'train.csv'))
-    valid.to_csv(os.path.join(dst_path, 'valid.csv'))
-    test.to_csv(os.path.join(dst_path, 'test.csv'))
-
-    # Save relevant columns in binary form for faster dataloading
-    # IMPORTANT: We always expect the id to be a single column identifying the complete time series.
-    # We also expect a copy of the id as a static categorical input.
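-    # Each time series is stored as one homogeneous array: every column is cast
-    # to float32 and the raw bytes are then reinterpreted (via numpy's view) as
-    # int32. TFTBinaryDataset applies the inverse view to recover the values.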
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
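-
-        # Build the hourly frame for this meter: 't' counts the hours elapsed
-        # since the earliest timestamp in the whole dataset.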
-        tmp = pd.DataFrame({'power_usage': srs})
-        date = tmp.index
-        tmp['t'] = (date - earliest_time).seconds / 60 / 60 + (
-            date - earliest_time).days * 24
-        tmp['days_from_start'] = (date - earliest_time).days
-        tmp['categorical_id'] = label
-        tmp['date'] = date
-        tmp['id'] = label
-        tmp['hour'] = date.hour
-        tmp['day'] = date.day
-        tmp['day_of_week'] = date.dayofweek
-        tmp['month'] = date.month
-
-        df_list.append(tmp)
-
-    output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)
-
-    output['categorical_id'] = output['id'].copy()
-    output['hours_from_start'] = output['t']
-    output['categorical_day_of_week'] = output['day_of_week'].copy()
-    output['categorical_hour'] = output['hour'].copy()
-
-    output.to_csv(os.path.join(path, 'standarized.csv'))
-
-def standarize_volatility(path):
-    df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0)  # no explicit index
-
-    # Add additional date/day fields
-    idx = [str(s).split('+')[0] for s in df.index]  # ignore timezones, we don't need them
-    dates = pd.to_datetime(idx)
-    df['date'] = dates
-    df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days
-    df['day_of_week'] = dates.dayofweek
-    df['day_of_month'] = dates.day
-    df['week_of_year'] = dates.weekofyear
-    df['month'] = dates.month
-    df['year'] = dates.year
-    df['categorical_id'] = df['Symbol'].copy()
-
-    # Process log volatility
-    vol = df['rv5_ss'].copy()
-    vol.loc[vol == 0.] = np.nan
-    df['log_vol'] = np.log(vol)
-
-    # Add static information
-    symbol_region_mapping = {
-        '.AEX': 'EMEA',
-        '.AORD': 'APAC',
-        '.BFX': 'EMEA',
-        '.BSESN': 'APAC',
-        '.BVLG': 'EMEA',
-        '.BVSP': 'AMER',
-        '.DJI': 'AMER',
-        '.FCHI': 'EMEA',
-        '.FTMIB': 'EMEA',
-        '.FTSE': 'EMEA',
-        '.GDAXI': 'EMEA',
-        '.GSPTSE': 'AMER',
-        '.HSI': 'APAC',
-        '.IBEX': 'EMEA',
-        '.IXIC': 'AMER',
-        '.KS11': 'APAC',
-        '.KSE': 'APAC',
-        '.MXX': 'AMER',
-        '.N225': 'APAC',
-        '.NSEI': 'APAC',
-        '.OMXC20': 'EMEA',
-        '.OMXHPI': 'EMEA',
-        '.OMXSPI': 'EMEA',
-        '.OSEAX': 'EMEA',
-        '.RUT': 'EMEA',
-        '.SMSI': 'EMEA',
-        '.SPX': 'AMER',
-        '.SSEC': 'APAC',
-        '.SSMI': 'EMEA',
-        '.STI': 'APAC',
-        '.STOXX50E': 'EMEA'
-    }
-
-    df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k])
-
-    # Perform final processing
-    output_df_list = []
-    for grp in df.groupby('Symbol'):
-        sliced = grp[1].copy()
-        sliced.sort_values('days_from_start', inplace=True)
-        # Impute log volatility values
-        sliced['log_vol'].fillna(method='ffill', inplace=True)
-        sliced = sliced.dropna()  # drop rows that still contain NaNs (e.g. before the first valid observation)
-        output_df_list.append(sliced)
-
-    df = pd.concat(output_df_list, axis=0)
-
-    df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-def standarize_traffic(path):
-    def process_list(s, variable_type=int, delimiter=None):
-        """Parses a line in the PEMS format to a list."""
-        if delimiter is None:
-            l = [
-                variable_type(i) for i in s.replace('[', '').replace(']', '').split()
-            ]
-        else:
-            l = [
-                variable_type(i)
-                for i in s.replace('[', '').replace(']', '').split(delimiter)
-            ]
-
-        return l
-
-    def read_single_list(filename):
-        """Returns a single list from a file in the PEMS-custom format."""
-        with open(os.path.join(path, filename), 'r') as dat:
-            l = process_list(dat.readlines()[0])
-        return l
-
-    def read_matrix(filename):
-        """Returns a matrix from a file in the PEMS-custom format."""
-        array_list = []
-        with open(os.path.join(path, filename), 'r') as dat:
-            lines = dat.readlines()
-            for i, line in enumerate(lines):
-                if (i + 1) % 50 == 0:
-                    print('Completed {} of {} rows for {}'.format(i + 1, len(lines), filename))
-                array = [
-                    process_list(row_split, variable_type=float, delimiter=None)
-                    for row_split in process_list(
-                        line, variable_type=str, delimiter=';')
-                ]
-                array_list.append(array)
-
-        return array_list
-
-    shuffle_order = np.array(read_single_list('randperm')) - 1  # index from 0
-    train_dayofweek = read_single_list('PEMS_trainlabels')
-    train_tensor = read_matrix('PEMS_train')
-    test_dayofweek = read_single_list('PEMS_testlabels')
-    test_tensor = read_matrix('PEMS_test')
-
-    # Invert the shuffle-order permutation
-    print('Shuffling')
-    inverse_mapping = {
-        new_location: previous_location
-        for previous_location, new_location in enumerate(shuffle_order)
-    }
-    reverse_shuffle_order = np.array([
-        inverse_mapping[new_location]
-        for new_location, _ in enumerate(shuffle_order)
-    ])
-
-    # Group and reorder based on the permutation matrix
-    print('Reordering')
-    day_of_week = np.array(train_dayofweek + test_dayofweek)
-    combined_tensor = np.array(train_tensor + test_tensor)
-
-    day_of_week = day_of_week[reverse_shuffle_order]
-    combined_tensor = combined_tensor[reverse_shuffle_order]
-
-    # Put everything back into a dataframe
-    print('Parsing as dataframe')
-    labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]
-
-    hourly_list = []
-    for day, day_matrix in enumerate(combined_tensor):
-        # Hourly data
-        hourly = pd.DataFrame(day_matrix.T, columns=labels)
-        hourly['hour_on_day'] = [int(i / 6) for i in hourly.index]  # sampled at 10 min intervals
-        if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
-            raise ValueError('Invalid hour! {}-{}'.format(
-                hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))
-
-        hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
-        hourly['sensor_day'] = day
-        hourly['time_on_day'] = hourly.index
-        hourly['day_of_week'] = day_of_week[day]
-
-        hourly_list.append(hourly)
-
-    hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)
-
-    # Flatten such that each entity uses one row in the dataframe
-    store_columns = [c for c in hourly_frame.columns if 'traj' in c]
-    other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
-    flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] +
-                           other_columns + ['id'])
-
-    for store in store_columns:
-        print('Processing {}'.format(store))
-
-        sliced = hourly_frame[[store] + other_columns].copy()
-        sliced.columns = ['values'] + other_columns
-        sliced['id'] = int(store.replace('traj_', ''))
-
-        # Sort by sensor-date-time
-        key = sliced['id'].apply(str) \
-            + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
-            + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
-        sliced = sliced.set_index(key).sort_index()
-
-        sliced['values'] = sliced['values'].fillna(method='ffill')
-        sliced['prev_values'] = sliced['values'].shift(1)
-        sliced['next_values'] = sliced['values'].shift(-1)
-
-        flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)
-
-    # Filter to match the range used by other academic papers
-    index = flat_df['sensor_day']
-    flat_df = flat_df[index < 173].copy()
-
-    # Create columns for categorical inputs
-    flat_df['categorical_id'] = flat_df['id'].copy()
-    flat_df['hours_from_start'] = flat_df['time_on_day'] \
-        + flat_df['sensor_day']*24.
-    flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
-    flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()
-
-    flat_df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-# XXX needs rework
-def standarize_favorita(data_folder):
-    import gc
-    # Extract only a subset of data to save/process for efficiency
-    start_date = pd.datetime(2015, 1, 1)
-    end_date = pd.datetime(2016, 6, 1)
-
-    print('Regenerating data...')
-
-    # Load temporal data
-    temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)
-
-    store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
-    oil = pd.read_csv(
-        os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
-    holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
-    items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
-    transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))
-
-    temporal['date'] = pd.to_datetime(temporal['date'])
-
-    # Filter dates to the selected range to reduce storage space requirements
-    if start_date is not None:
-        temporal = temporal[(temporal['date'] >= start_date)]
-    if end_date is not None:
-        temporal = temporal[(temporal['date'] < end_date)]
-
-    dates = temporal['date'].unique()
-
-    # Add trajectory identifier
-    temporal['traj_id'] = temporal['store_nbr'].apply(
-        str) + '_' + temporal['item_nbr'].apply(str)
-    temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply(
-        str)
-
-    # Remove all IDs with negative returns
-    print('Removing returns data')
-    min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
-    valid_ids = set(min_returns[min_returns >= 0].index)
-    selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
-    new_temporal = temporal[selector].copy()
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-    temporal['open'] = 1
-
-    # Resampling
-    print('Resampling to regular grid')
-    resampled_dfs = []
-    for traj_id, raw_sub_df in temporal.groupby('traj_id'):
-        print('Resampling', traj_id)
-        sub_df = raw_sub_df.set_index('date', drop=True).copy()
-        sub_df = sub_df.resample('1d').last()
-        sub_df['date'] = sub_df.index
-        sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
-            = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
-        sub_df['open'] = sub_df['open'].fillna(0)  # flag where sales data is unknown
-        sub_df['log_sales'] = np.log(sub_df['unit_sales'])
-
-        resampled_dfs.append(sub_df.reset_index(drop=True))
-
-    new_temporal = pd.concat(resampled_dfs, axis=0)
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-
-    print('Adding oil')
-    oil.name = 'oil'
-    oil.index = pd.to_datetime(oil.index)
-    # XXX The lines below match the oil price on a given date with the rest of the time series:
-    # missing values in the oil series are copied from the previous index, then the oil series is
-    # joined with temporal. Some dates present in temporal aren't present in oil; for those the
-    # oil value is substituted with -1. WHY?!
-    # TODO: check how many NaNs there are after the first step. Previously the oil series was
-    # extended by the dates present in the dates variable with a NaN value, which was then forward
-    # filled. This behavior is no longer supported by pandas, so we changed to the DataFrame.isin
-    # method. This leaves us with more NaNs after the first step than previously. To achieve the
-    # previous behavior we would have to join the series before filling NaNs.
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
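-
-# Helpers that bind each training process to the CPU cores local to its GPU
-# (as reported by NVML through pynvml), which is intended to reduce
-# cross-socket memory traffic.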
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = 
torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) - - if args.joint_visualization: - summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) - for q, t in _g.items(): # target and quantiles; values per time horizon - if q == 'targets': - targets = torch.cat([t[:,0], t[-1,1:]]) # WIP - # We want to plot targets on the same graph as predictions. Probably could be written better. - for i, val in enumerate(targets): - summary_writer.add_scalars(str(key), {f'{q}':val}, i) - continue - - # Tensor t contains different time horizons which are shifted in phase - # The next lines realign them - y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan')) - for i in range(y.shape[1]): - y[i:i+t.shape[0], i] = t[:,i] - - for i, vals in enumerate(y): # timestep, values per time horizon - summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i) # v == v filters out NaNs - summary_writer.close() - - if args.save_predictions: - for q, t in _g.items(): - df = pd.DataFrame(t.tolist()) - df.columns = [f't+{i+1}' for i in range(len(df.columns))] - os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True) - df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv')) - - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - q_risk = 2 * losses / normalizer - - perf_dict = { - 'throughput': perf_meter.avg, - 'latency_avg': perf_meter.total_time/len(perf_meter.intervals), - 'latency_p90': perf_meter.p(90), - 'latency_p95': perf_meter.p(95), - 'latency_p99': perf_meter.p(99), - 'total_inference_time': perf_meter.total_time, - } - - return q_risk, perf_dict - - -def main(args): - - setup_logger(args) - # Set up model - state_dict = torch.load(args.checkpoint) - config = state_dict['config'] - model = TemporalFusionTransformer(config).cuda() - model.load_state_dict(state_dict['model']) - model.eval() - model.cuda() - - # Set up dataset - test_split = TFTDataset(args.data, config) - data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4) - - scalers = pickle.load(open(args.tgt_scalers, 'rb')) - cat_encodings = pickle.load(open(args.cat_encodings, 'rb')) - - if args.visualize: - # TODO: abstract away all forms of visualization. - visualize_v2(args, config, model, data_loader, scalers, cat_encodings) - - quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings) - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, **perf_dict} - dllogger.log(step=(), data=finish_log, verbosity=1) - print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(quantiles['test_p10'], quantiles['test_p50'], quantiles['test_p90'])) - print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format( - perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99'])) - -if __name__=='__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--checkpoint', type=str, - help='Path to the checkpoint') - parser.add_argument('--data', type=str, - help='Path to the test split of the dataset') - parser.add_argument('--tgt_scalers', type=str, - help='Path to the tgt_scalers.bin file produced by the preprocessing') - parser.add_argument('--cat_encodings', type=str, - help='Path to the cat_encodings.bin file produced by the preprocessing') - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot') - parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each time series on a separate plot; projections are concatenated.') - parser.add_argument('--save_predictions', action='store_true') - parser.add_argument('--results', type=str, default='/results') - parser.add_argument('--log_file', type=str, default='dllogger.json') - ARGS = parser.parse_args() - main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py deleted file mode 100644 index 83d2ac7f..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/log_helper.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
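Note on the q-risk reported by the inference.py removed above: it is twice the quantile (pinball) loss divided by the mean absolute target. A minimal self-contained sketch, assuming the standard pinball loss; the repository's criterions.QuantileLoss is not part of this diff, so the helpers below are illustrative:

import torch

def pinball_loss(pred, target, q):
    # standard pinball loss: under-prediction weighted by q, over-prediction by (1 - q)
    diff = target - pred
    return torch.max(q * diff, (q - 1) * diff).mean()

def q_risk(preds, target, quantiles=(0.1, 0.5, 0.9)):
    # preds: [N, T, num_quantiles]; target: [N, T]
    losses = torch.stack([pinball_loss(preds[..., i], target, q) for i, q in enumerate(quantiles)])
    return 2 * losses / target.abs().mean()  # mean/mean is equivalent to the usual sum/sum normalization

The 'sum' field logged above is simply the sum of the three per-quantile risks.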
- -import os -import subprocess -import sys -import itertools -import atexit - -import dllogger -from dllogger import Backend, JSONStreamBackend, StdOutBackend - -import torch.distributed as dist -from torch.utils.tensorboard import SummaryWriter - -class TensorBoardBackend(Backend): - def __init__(self, verbosity, log_dir): - super().__init__(verbosity=verbosity) - self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), - flush_secs=120, - max_queue=200 - ) - self.hp_cache = None - atexit.register(self.summary_writer.close) - - @property - def log_level(self): - return self._log_level - - def metadata(self, timestamp, elapsedtime, metric, metadata): - pass - - def log(self, timestamp, elapsedtime, step, data): - if step == 'HPARAMS': - parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} - # Unpack lists and tuples - for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: - parameters.update(d) - # Remove custom classes and other non-primitive values - parameters = {k: v for k, v in parameters.items() if isinstance(v, (int, float, str, bool))} - parameters.update({k:'None' for k, v in data.items() if v is None}) - self.hp_cache = parameters - if step == (): - if self.hp_cache is None: - print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) - return - self.summary_writer.add_hparams(self.hp_cache, data) - if not isinstance(step, int): - return - for k, v in data.items(): - self.summary_writer.add_scalar(k, v, step) - - def flush(self): - pass - -def setup_logger(args): - os.makedirs(args.results, exist_ok=True) - log_path = os.path.join(args.results, args.log_file) - - if os.path.exists(log_path): - for i in itertools.count(): - s_fname = args.log_file.split('.') - fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}' - log_path = os.path.join(args.results, fname) - if not os.path.exists(log_path): - break - - def metric_format(metric, metadata, value): - return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value) - def step_format(step): - if step == (): - return "Finished |" - elif isinstance(step, int): - return "Step {0: <5} |".format(step) - return "Step {} |".format(step) - - - if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0: - dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path), - TensorBoardBackend(verbosity=1, log_dir=args.results), - StdOutBackend(verbosity=2, - step_format=step_format, - prefix_format=lambda x: "")#, - #metric_format=metric_format) - ]) - else: - dllogger.init(backends=[]) - dllogger.log(step='PARAMETER', data=vars(args), verbosity=0) - - container_setup_info = {**get_framework_env_vars(), **get_system_info()} - dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0) - - dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) - dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) - dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) - dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'}) - dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'}) - dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'}) - dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) - dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) - dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'}) - dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'}) - dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'}) - dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'}) - - -def get_framework_env_vars(): - return { - 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'), - 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'), - 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), - 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), - 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), - 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), - 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), - 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), - 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), - 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), - } - -def get_system_info(): - system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout - system_info = [i.decode('utf-8') for i in system_info.split(b'\n')] - system_info = [x for x in system_info if x] - return {'system_info': system_info} diff --git
a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. Temporal known a priori continuous - # 5. Temporal observed categorical - # 6. 
Temporal observed continuous - # 7. Temporal observed targets (time series observed so far) - - self.s_cat_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None - self.t_cat_k_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None - self.t_cat_o_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None - - self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) - - self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) - - if self.s_cont_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) - if self.t_cont_k_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) - if self.t_cont_o_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) - torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) - - def _apply_embedding(self, - cat: Optional[Tensor], - cont: Optional[Tensor], - cat_emb: Optional[nn.ModuleList], - cont_emb: Tensor, - cont_bias: Tensor, - ) -> Optional[Tensor]: - e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None - if cont is not None: - # the line below is equivalent to the following einsums - #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) - #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) - e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) - e_cont = e_cont + cont_bias - else: - e_cont = None - - if e_cat is not None and e_cont is not None: - return torch.cat([e_cat, e_cont], dim=-2) - elif e_cat is not None: - return e_cat - elif e_cont is not None: - return e_cont - else: - return None - - def forward(self, x: Dict[str, Tensor]): - # temporal/static categorical/continuous known/observed input - s_cat_inp = x.get('s_cat', None) - s_cont_inp = x.get('s_cont', None) - t_cat_k_inp = x.get('k_cat', None) - t_cont_k_inp = x.get('k_cont', None) - t_cat_o_inp = x.get('o_cat', None) - t_cont_o_inp = x.get('o_cont', None) - t_tgt_obs = x['target'] # Has to be present - - # Static inputs are expected to be equal for all timesteps - # For memory efficiency there is no assert statement - s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None - s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None - - s_inp = self._apply_embedding(s_cat_inp, - s_cont_inp, - self.s_cat_embed, - self.s_cont_embedding_vectors, - self.s_cont_embedding_bias) - t_known_inp =
self._apply_embedding(t_cat_k_inp, - t_cont_k_inp, - self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - #the line below performs batched matrix vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_c context - # state_h context - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) - attn_score.mul_(self.scale) - - if mask_future_timesteps: - attn_score = 
attn_score + self._mask - - attn_prob = F.softmax(attn_score, dim=3) - attn_prob = self.attn_dropout(attn_prob) - - # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) - attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) - m_attn_vec = torch.mean(attn_vec, dim=1) - out = self.out_proj(m_attn_vec) - out = self.out_dropout(out) - - return out, attn_vec - - - -class TemporalFusionTransformer(nn.Module): - """ - Implementation of https://arxiv.org/abs/1912.09363 - """ - def __init__(self, config): - super().__init__() - - if hasattr(config, 'model'): - config = config.model - - self.encoder_length = config.encoder_length # this determines how far into the past the model looks - - self.embedding = TFTEmbedding(config) - self.static_encoder = StaticCovariateEncoder(config) - - self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) - self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) - self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - - - self.input_gate = GLU(config.hidden_size, config.hidden_size) - self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.enrichment_grn = GRN(config.hidden_size, - config.hidden_size, - context_hidden_size=config.hidden_size, - dropout=config.dropout) - self.attention = InterpretableMultiHeadAttention(config) - self.attention_gate = GLU(config.hidden_size, config.hidden_size) - self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.positionwise_grn = GRN(config.hidden_size, - config.hidden_size, - dropout=config.dropout) - - self.decoder_gate = GLU(config.hidden_size, config.hidden_size) - self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) - - def forward(self, x: Dict[str, Tensor]) -> Tensor: - s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) - - # Static context - cs, ce, ch, cc = self.static_encoder(s_inp) - ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # LSTM initial states - - # Temporal input - _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] - if t_observed_inp is not None: - _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) - - historical_inputs = torch.cat(_historical_inputs, dim=-2) - future_inputs = t_known_inp[:, self.encoder_length:] - - # Encoders - historical_features, _ = self.history_vsn(historical_inputs, cs) - history, state = self.history_encoder(historical_features, (ch, cc)) - future_features, _ = self.future_vsn(future_inputs, cs) - future, _ = self.future_encoder(future_features, state) - torch.cuda.synchronize() # this call gives perf boost for unknown reasons - - # skip connection - input_embedding = torch.cat([historical_features, future_features], dim=1) - temporal_features = torch.cat([history, future], dim=1) - temporal_features = self.input_gate(temporal_features) - temporal_features = temporal_features + input_embedding - temporal_features = self.input_gate_ln(temporal_features) - - # Static enrichment - enriched = self.enrichment_grn(temporal_features, c=ce) - - # Temporal self-attention - x, _ = self.attention(enriched, mask_future_timesteps=True) - - # Don't compute historical quantiles - x = x[:, self.encoder_length:, :] - temporal_features = temporal_features[:, self.encoder_length:, :] - enriched = enriched[:, self.encoder_length:, :] -
- x = self.attention_gate(x) - x = x + enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
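For readability, the inline python -c calls in the get_data.sh above expand as follows for DS=electricity (${DS^} is bash uppercase-first-letter expansion, so it resolves to ElectricityConfig; module, function, and path names are taken verbatim from the script):

from data_utils import standarize_electricity as standarize, preprocess
from configuration import ElectricityConfig as Config

standarize("/data/raw/electricity")
preprocess("/data/raw/electricity/standarized.csv",
           "/data/processed/electricity_bin", Config())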
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
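All of these run_*.sh scripts go through python -m torch.distributed.launch --nproc_per_node=${NGPU}, which spawns one training process per GPU and hands each worker its identity via environment variables; --batch_size is per process, so the effective global batch is NGPU x BATCH_SIZE (8 x 1024 by default). A simplified excerpt of how the deleted train.py consumes those variables:

import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get('LOCAL_RANK', 0))  # GPU index on this node, set by the launcher
world_size = int(os.environ.get('WORLD_SIZE', 1))  # total number of workers

if world_size > 1:
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')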
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, 
num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * args.distributed_world_size, - exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, data=log_dict, 
verbosity=1) - global_step += 1 - - validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) - - if validate.early_stop_c >= args.early_stopping: - print_once('Early stopping') - break - - ### TEST PHASE ### - state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') - if isinstance(model, DDP): - model.module.load_state_dict(state_dict['model']) - else: - model.load_state_dict(state_dict['model']) - model.cuda().eval() - - tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) - cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) - - unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - quantiles = 2 * losses / normalizer - - if args.distributed_world_size > 1: - quantiles = quantiles.cuda() - dist.all_reduce(quantiles) - quantiles /= args.distributed_world_size - - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} - dllogger.log(step=(), data=finish_log, verbosity=1) - -def validate(args, config, model, criterion, dataloader, global_step): - if not hasattr(validate, 'best_valid_loss'): - validate.best_valid_loss = float('inf') - if not hasattr(validate, 'early_stop_c'): - validate.early_stop_c = 0 - model.eval() - - losses = [] - validation_start = time.time() - for batch in dataloader: - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - bs = next(t for t in batch.values() if t is not None).shape[0] - losses.append((p_losses, bs)) - - validation_end = time.time() - - p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch may not be full - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses = p_losses/args.distributed_world_size - - ips = len(dataloader.dataset) / (validation_end - validation_start) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} - - if log_dict['loss'] < validate.best_valid_loss: - validate.best_valid_loss = log_dict['loss'] - validate.early_stop_c = 0 - validate.conv_step = global_step - if not dist.is_initialized() or dist.get_rank() == 0: - state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() - ckpt = {'args':args, 'config':config, 'model':state_dict} - torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) - if args.distributed_world_size > 1: - dist.barrier() - else: - validate.early_stop_c += 1 - - log_dict = {'val_'+k:v for k,v in log_dict.items()} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--data_path', type=str, required=True, - help='Path to the dataset') - parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), - help='Dataset name') - parser.add_argument('--epochs', type=int, default=25, - help='Default number
of training epochs') - parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], - help="""Subsample the dataset. Specify number of training and valid examples. - Values can be provided in scientific notation. Floats will be truncated.""") - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seed', type=int, default=1) - parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') - parser.add_argument('--clip_grad', type=float, default=0.0) - parser.add_argument('--grad_accumulation', type=int, default=0) - parser.add_argument('--early_stopping', type=int, default=1000, - help='Stop training if validation loss does not improve for more than this number of epochs.') - parser.add_argument('--results', type=str, default='/results', - help='Directory in which results are stored') - parser.add_argument('--log_file', type=str, default='dllogger.json', - help='Name of dllogger output file') - parser.add_argument('--distributed_world_size', type=int, metavar='N', - default=torch.cuda.device_count(), - help='total number of GPUs across all nodes (default: all visible GPUs)') - parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, - help='rank of the current worker') - parser.add_argument('--local_rank', default=0, type=int, - help='rank of the current worker') - parser.add_argument('--overwrite_config', type=str, default='', - help='JSON string used to overload config') - parser.add_argument('--affinity', type=str, - default='socket_unique_interleaved', - choices=['socket', 'single', 'single_unique', - 'socket_unique_interleaved', - 'socket_unique_continuous', - 'disabled'], - help='type of CPU affinity') - parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') - - - ARGS = parser.parse_args() - main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py deleted file mode 100644 index bf88be40..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/tft_pyt/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
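The utils.py removed here implements the PerformanceMeter that train.py and inference.py use for throughput and latency statistics. A minimal usage sketch (the sleep stands in for one training or inference step; the first step is excluded as warm-up, mirroring how the callers use exclude_from_total):

import time
from utils import PerformanceMeter

meter = PerformanceMeter()
for step in range(5):
    meter.reset_current_lap()
    time.sleep(0.01)  # stand-in for real work on a batch
    ips = meter.update(64, exclude_from_total=(step == 0))  # 64 items processed this step
print(f'avg items/s: {meter.avg:.1f}  p90 latency: {meter.p(90) * 1000:.1f} ms')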
- -import time - -class PerformanceMeter(): - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.count = 0 - self.total_time = 0 - self.last_update_time = time.time() - self.intervals = [] - - def update(self, n, exclude_from_total=False): - delta = time.time() - self.last_update_time - self.intervals.append(delta) - if not exclude_from_total: - self.total_time += delta - self.count += n - self.avg = self.count / self.total_time - self.last_update_time = time.time() - - return n/delta - - def reset_current_lap(self): - self.last_update_time = time.time() - - def p(self, i): - assert i <= 100 - idx = int(len(self.intervals) * i / 100) - return sorted(self.intervals)[idx] - diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 
1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * 
args.distributed_world_size, - exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - global_step += 1 - - validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) - - if validate.early_stop_c >= args.early_stopping: - print_once('Early stopping') - break - - ### TEST PHASE ### - state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') - if isinstance(model, DDP): - model.module.load_state_dict(state_dict['model']) - else: - model.load_state_dict(state_dict['model']) - model.cuda().eval() - - tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) - cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) - - unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - quantiles = 2 * losses / normalizer - - if args.distributed_world_size > 1: - quantiles = quantiles.cuda() - dist.all_reduce(quantiles) - quantiles /= args.distributed_world_size - - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} - dllogger.log(step=(), data=finish_log, verbosity=1) - -def validate(args, config, model, criterion, dataloader, global_step): - if not hasattr(validate, 'best_valid_loss'): - validate.best_valid_loss = float('inf') - if not hasattr(validate, 'early_stop_c'): - validate.early_stop_c = 0 - model.eval() - - losses = [] - validation_start = time.time() - for batch in dataloader: - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - bs = next(t for t in batch.values() if t is not None).shape[0] - losses.append((p_losses, bs)) - - validation_end = time.time() - - p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch may not be full - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses = p_losses/args.distributed_world_size - - ips = len(dataloader.dataset) / (validation_end - validation_start) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} - - if log_dict['loss'] < validate.best_valid_loss: - validate.best_valid_loss = log_dict['loss'] - validate.early_stop_c = 0 - validate.conv_step = global_step - if not dist.is_initialized() or dist.get_rank() == 0: - state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() - ckpt = {'args':args, 'config':config, 'model':state_dict} - torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) - if args.distributed_world_size > 1: - dist.barrier() - else: - validate.early_stop_c += 1 - - log_dict = {'val_'+k:v for k,v in log_dict.items()} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() -
parser.add_argument('--data_path', type=str, required=True, - help='Path to the dataset') - parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), - help='Dataset name') - parser.add_argument('--epochs', type=int, default=25, - help='Default number of training epochs') - parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], - help="""Subsample the dataset. Specify number of training and valid examples. - Values can be provided in scientific notation. Floats will be truncated.""") - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seed', type=int, default=1) - parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') - parser.add_argument('--clip_grad', type=float, default=0.0) - parser.add_argument('--grad_accumulation', type=int, default=0) - parser.add_argument('--early_stopping', type=int, default=1000, - help='Stop training if validation loss does not improve for more than this number of epochs.') - parser.add_argument('--results', type=str, default='/results', - help='Directory in which results are stored') - parser.add_argument('--log_file', type=str, default='dllogger.json', - help='Name of dllogger output file') - parser.add_argument('--distributed_world_size', type=int, metavar='N', - default=torch.cuda.device_count(), - help='total number of GPUs across all nodes (default: all visible GPUs)') - parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, - help='rank of the current worker') - parser.add_argument('--local_rank', default=0, type=int, - help='rank of the current worker') - parser.add_argument('--overwrite_config', type=str, default='', - help='JSON string used to overload config') - parser.add_argument('--affinity', type=str, - default='socket_unique_interleaved', - choices=['socket', 'single', 'single_unique', - 'socket_unique_interleaved', - 'socket_unique_continuous', - 'disabled'], - help='type of CPU affinity') - parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') - - - ARGS = parser.parse_args() - main(ARGS) diff --git a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py b/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py deleted file mode 100644 index bf88be40..00000000 --- a/PyTorch/Forecasting/TFT/TemporalFusionTransformers/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import time - -class PerformanceMeter(): - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.count = 0 - self.total_time = 0 - self.last_update_time = time.time() - self.intervals = [] - - def update(self, n, exclude_from_total=False): - delta = time.time() - self.last_update_time - self.intervals.append(delta) - if not exclude_from_total: - self.total_time += delta - self.count += n - self.avg = self.count / self.total_time - self.last_update_time = time.time() - - return n/delta - - def reset_current_lap(self): - self.last_update_time = time.time() - - def p(self, i): - assert i <= 100 - idx = int(len(self.intervals) * i / 100) - return sorted(self.intervals)[idx] - diff --git a/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile b/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/PyTorch/Forecasting/TFT/tft_pyt/LICENCE b/PyTorch/Forecasting/TFT/tft_pyt/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT b/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT
deleted file mode 100644
index 5d1d88cf..00000000
--- a/PyTorch/Forecasting/TFT/tft_pyt/LICENSE AGREEMENT
+++ /dev/null
@@ -1,25 +0,0 @@
-Individual Contributor License Agreement (CLA)
-Thank you for submitting your contributions to this project.
-
-By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project.
-
-License.
-You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement.
-
-This entails that, to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore, you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions.
-
-Moral Rights.
-To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project.
-
-Third Party Content.
-If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project.
-
-Representations.
-You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions.
-
-Disclaimer.
-To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support.
-
-No Obligation.
-You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to do so will be made at the sole discretion of the maintainers or their authorized delegates.
-
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/NOTICE b/PyTorch/Forecasting/TFT/tft_pyt/NOTICE
deleted file mode 100644
index ae19bb47..00000000
--- a/PyTorch/Forecasting/TFT/tft_pyt/NOTICE
+++ /dev/null
@@ -1,3 +0,0 @@
-TFT for PyTorch
-
-This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/README.md b/PyTorch/Forecasting/TFT/tft_pyt/README.md
deleted file mode 100644
index 69b39d12..00000000
--- a/PyTorch/Forecasting/TFT/tft_pyt/README.md
+++ /dev/null
@@ -1,465 +0,0 @@
-# Temporal Fusion Transformer For PyTorch
-
-This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA.
-
-## Table Of Contents
-
-- [Model overview](#model-overview)
-  * [Model architecture](#model-architecture)
-  * [Default configuration](#default-configuration)
-  * [Feature support matrix](#feature-support-matrix)
-    * [Features](#features)
-  * [Mixed precision training](#mixed-precision-training)
-    * [Enabling mixed precision](#enabling-mixed-precision)
-    * [Enabling TF32](#enabling-tf32)
-  * [Glossary](#glossary)
-- [Setup](#setup)
-  * [Requirements](#requirements)
-- [Quick Start Guide](#quick-start-guide)
-- [Advanced](#advanced)
-  * [Scripts and sample code](#scripts-and-sample-code)
-  * [Command-line options](#command-line-options)
-  * [Getting the data](#getting-the-data)
-    * [Dataset guidelines](#dataset-guidelines)
-    * [Multi-dataset](#multi-dataset)
-  * [Training process](#training-process)
-  * [Inference process](#inference-process)
-- [Performance](#performance)
-  * [Benchmarking](#benchmarking)
-    * [Training performance benchmark](#training-performance-benchmark)
-    * [Inference performance benchmark](#inference-performance-benchmark)
-  * [Results](#results)
-    * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
-      * [Training stability test](#training-stability-test)
-    * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
-- [Release notes](#release-notes)
-  * [Changelog](#changelog)
-  * [Known issues](#known-issues)
-
-
-
-## Model overview
-
-The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation in how it addresses missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
-The model enables the prediction of confidence intervals for future values of a time series over multiple future timesteps.
-
-This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
-### Model architecture
-
-The TFT model is a hybrid architecture joining the LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector for each one. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by that variable's value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of the other variables and as the initial state of the LSTM encoders.
-After encoding, the variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some parts of the network.
-For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
-![](TFT_architecture.PNG)
-*image source: https://arxiv.org/abs/1912.09363*
-
-### Default configuration
-
-The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss
-
-`QL(y, ŷ, q) = q * max(y - ŷ, 0) + (1 - q) * max(ŷ - y, 0)`
-
-averaged over the quantiles q in [0.1, 0.5, 0.9].
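-
-To make the loss concrete, the sketch below evaluates it for a single prediction and quantile. It mirrors the `QuantileLoss` module in `criterions.py`; the tensor values are illustrative only.
-
-```python
-import torch
-import torch.nn.functional as F
-
-def quantile_loss(predictions, targets, q):
-    # diff > 0 means over-prediction, diff < 0 means under-prediction
-    diff = predictions - targets
-    return ((1 - q) * F.relu(diff) + q * F.relu(-diff)).mean()
-
-y = torch.tensor([10.0])
-# Under-predicting by 2 costs the P90 head nine times more than
-# over-predicting by the same amount.
-print(quantile_loss(torch.tensor([8.0]), y, 0.9))   # 0.9 * 2.0 = 1.8
-print(quantile_loss(torch.tensor([12.0]), y, 0.9))  # 0.1 * 2.0 = 0.2
-```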
-The default configurations are tuned for distributed training on DGX-1-32G with mixed precision and dynamic loss scaling. Specific values are provided in the table below.
-
-| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-
-### Feature support matrix
-
-The following features are supported by this model:
-
-| Feature                    | TFT
-|----------------------------|--------------------------
-|Distributed data parallel   | Yes
-|PyTorch AMP                 | Yes
-
-
-#### Features
-
-[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html)
-provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
-
-[PyTorch
-DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module
-wrapper that enables easy multiprocess distributed data-parallel
-training.
-
-### Mixed precision training
-
-Mixed precision is the combined use of different numerical precisions in a
-computational method.
-[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
-computational speedup by performing operations in half-precision format while
-storing minimal information in single-precision to retain as much information
-as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with
-both the Turing and Ampere architectures, significant training speedups are
-experienced by switching to
-mixed precision -- up to 3x overall speedup on the most arithmetically intense
-model architectures. Using mixed precision training previously required two
-steps:
-
-1. Porting the model to use the FP16 data type where appropriate.
-2. Manually adding loss scaling to preserve small gradient values.
-
-The ability to train deep learning networks with lower precision was introduced
-in the Pascal architecture and first supported in [CUDA
-8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
-Learning SDK.
-
-For information about:
-* How to train using mixed precision, refer to the [Mixed Precision
-  Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
-  Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
-  documentation.
-* Techniques used for mixed precision training, refer to the [Mixed-Precision
-  Training of Deep Neural
-  Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
-  blog.
-* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in
-  PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/)
-  blog.
-
-
-#### Enabling mixed precision
-
-Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (`torch.cuda.amp`) module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class.
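-
-As an illustration, a typical training step under this scheme looks roughly like the sketch below. This is a generic example of the `torch.cuda.amp` pattern described above, not this repository's exact training loop (which relies on APEX AMP when `--use_amp` is set); `model`, `criterion`, `optimizer`, and `loader` are assumed to be defined elsewhere.
-
-```python
-import torch
-
-scaler = torch.cuda.amp.GradScaler()  # performs dynamic loss scaling
-
-for batch, targets in loader:
-    optimizer.zero_grad()
-    # Run the forward pass in mixed precision
-    with torch.cuda.amp.autocast():
-        predictions = model(batch)
-        loss = criterion(predictions, targets)
-    # Scale the loss so small gradients survive the FP16 backward pass,
-    # then unscale before the optimizer step
-    scaler.scale(loss).backward()
-    scaler.step(optimizer)
-    scaler.update()
-```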
-All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples).
-
-To enable mixed precision for TFT, simply add the `--use_amp` option to the training script.
-
-#### Enabling TF32
-
-TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
-
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models that require a high dynamic range for weights or activations.
-
-For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
-
-TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
-
-
-### Glossary
-
-**Multi-horizon prediction**
-The process of estimating values of a time series for multiple future time steps.
-
-**Quantiles**
-Cut points dividing the range of a probability distribution into intervals with equal probabilities.
-
-**Time series**
-A series of data points indexed and equally spaced in time.
-
-**Transformer**
-The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called the Transformer, which uses an attention mechanism to transform one sequence into another.
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to start training the TFT model.
-
-### Requirements
-
-This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-- Supported GPUs:
-  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
-  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
-
-For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
-
-For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
-
-## Quick Start Guide
-
-To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
-
-1. Clone the repository.
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/Forecasting/TFT
-```
-
-2. Build the TFT PyTorch NGC container.
-```bash
-docker build --network=host -t tft .
-```
-
-3. Start an interactive session in the NGC container to run training/inference.
-```bash
-docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
-```
-
-Note: Ensure you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
-
-4. Download and preprocess the datasets.
-```bash
-bash scripts/get_data.sh
-```
-
-5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
-These scripts are tuned for DGX-1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system (see the example after these steps).
-```bash
-bash scripts/run_electricity.sh
-bash scripts/run_traffic.sh
-```
-
-6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy. Here, `<dataset>` is the processed dataset directory, for example `electricity_bin`.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
-```
-
-7. Start inference/predictions. Visualize and save predictions by running the following command.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
---visualize \
---save_predictions
-```
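-
-For example, to adapt step 5 to a smaller system, and assuming the run scripts read `NGPU` and `BATCH_SIZE` as environment variables (check the scripts themselves for the exact variable names), a 4-GPU run might look like:
-```bash
-NGPU=4 BATCH_SIZE=512 bash scripts/run_electricity.sh
-```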
-
-Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance results](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
-
-## Advanced
-
-The following sections provide more details about the dataset, running training and inference, and the training results.
-
-### Scripts and sample code
-
-In the root directory, the most important files are:
-
-`train.py`: Entry point for training
-`data_utils.py`: File containing the dataset implementation and preprocessing functions
-`modeling.py`: Definition of the model
-`configuration.py`: Contains configuration classes for various experiments
-`test.py`: Entry point for testing a trained model
-`Dockerfile`: Container definition
-`log_helper.py`: Contains helper functions for setting up dllogger
-`criterions.py`: Definitions of loss functions
-
-The `scripts` directory contains scripts for the default use cases:
-`run_electricity.sh`: train the default model on the electricity dataset
-`run_traffic.sh`: train the default model on the traffic dataset
-
-### Command-line options
-
-To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
-`python train.py --help`.
-
-The following example output is printed when running the model:
-```
-usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
-                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data_path DATA_PATH
-  --dataset {electricity,volatility,traffic,favorita}
-  --epochs EPOCHS
-  --sample_data SAMPLE_DATA SAMPLE_DATA
-  --batch_size BATCH_SIZE
-  --lr LR
-  --seed SEED
-  --use_amp             Enable automatic mixed precision
-  --clip_grad CLIP_GRAD
-  --early_stopping EARLY_STOPPING
-                        Stop training if validation loss does not improve for more than this number of epochs.
-  --results RESULTS
-  --log_file LOG_FILE
-  --distributed_world_size N
-                        total number of GPUs across all nodes (default: all visible GPUs)
-  --distributed_rank DISTRIBUTED_RANK
-                        rank of the current worker
-  --local_rank LOCAL_RANK
-                        rank of the current worker
-  --overwrite_config OVERWRITE_CONFIG
-                        JSON string used to overload config
-
-```
-
-### Getting the data
-
-The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, will automatically download and preprocess the training, validation, and test datasets, and produce files that contain the scalers.
-
-#### Dataset guidelines
-
-The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into the common format that contains the features used for training. Then the standardized data is cleaned, normalized, encoded, and binarized.
-This step does the following:
-- Drop all the columns that are not marked in the configuration file as used for training or preprocessing
-- Flatten indices in case time series are indexed by more than one column
-- Split the data into training, validation, and test splits
-- Filter out all the time series shorter than the minimal example length
-- Normalize the columns marked as continuous in the configuration file
-- Encode as integers the columns marked as categorical
-- Save the data in CSV and binary formats
-
-#### Multi-dataset
-
-In order to use an alternate dataset, you have to write a function that parses your data into the common format (see the sketch at the end of this section). The format is as follows:
-- There is at least one id column
-- There is exactly one time column (which can also be used as a feature column)
-- Each feature is in a separate column
-- Each row represents a moment in time for only one time series
-
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
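-
-As an illustration, a parser for a hypothetical dataset with several sensors might look like the sketch below. The raw column names (`sensor_name`, `hour`, `reading`, `weekday`) are invented for this example; only the four formatting rules above matter.
-
-```python
-import pandas as pd
-
-def parse_my_dataset(path):
-    # Raw data: one row per (timestamp, sensor) reading
-    df = pd.read_csv(path)
-    out = pd.DataFrame({
-        'id': df['sensor_name'],         # at least one id column
-        'hours_from_start': df['hour'],  # exactly one time column
-        'value': df['reading'],          # target feature
-        'day_of_week': df['weekday'],    # a known feature
-    })
-    # Each row represents a moment in time for exactly one time series
-    return out.sort_values(['id', 'hours_from_start'])
-```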
-
-### Training process
-
-The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early if the validation loss has not improved for N consecutive epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss defined in the [Default configuration](#default-configuration) section. After training, the checkpoint with the lowest validation loss is evaluated on the test split with the q-risk metric
-
-`q-risk_q = 2 * sum(QL_q(y, ŷ)) / sum(|y|)`
-
-where the sums run over all predicted points of the test split.
-Results are stored by default in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint that had the lowest validation loss, dllogger logs (in a dictionary-per-line format), and TensorBoard logs.
-
-### Inference process
-
-Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running inference. Example command:
-```
-python inference.py \
---checkpoint /results/checkpoint.pt \
---data /data/processed/electricity_bin/test.csv \
---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
---batch_size 2048 \
---visualize \
---save_predictions \
---joint_visualization \
---results /results \
---use_amp
-```
-
-In the default setting, the script evaluates the model on the specified dataset and prints the q-risk for that dataset. To save the predictions, use the `--save_predictions` option; predictions will be stored in CSV format in the directory specified by the `--results` option. The `--joint_visualization` option plots graphs in TensorBoard format, allowing us to inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file.
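-
-The q-risk reported by the script can also be recomputed offline from unscaled predictions and targets. The sketch below mirrors the test phase in `train.py`; the input tensors are placeholders, with `predictions` holding the per-quantile outputs in its last dimension (e.g., shape `[batch, horizon, 3]`) and `targets` broadcastable against it (e.g., shape `[batch, horizon, 1]`).
-
-```python
-import torch
-import torch.nn.functional as F
-
-def q_risk(predictions, targets, quantiles=(0.1, 0.5, 0.9)):
-    q = torch.tensor(quantiles)
-    diff = predictions - targets
-    ql = (1 - q) * F.relu(diff) + q * F.relu(-diff)
-    # 2 * summed quantile loss, normalized by the summed absolute target;
-    # returns one q-risk value per quantile
-    return 2 * ql.view(-1, ql.shape[-1]).mean(0) / targets.abs().mean()
-```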
-
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in training and inference modes.
-
-#### Training performance benchmark
-
-In order to run training benchmarks, use the `scripts/benchmark.sh` script.
-
-#### Inference performance benchmark
-
-To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
-
-### Results
-
-The following sections provide details on how we achieved our performance and accuracy in training and inference.
-
-#### Training accuracy results
-
-We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs. In the accuracy columns below, the three values are the P10 / P50 / P90 q-risks.
-
-##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x
-| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x
-| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x
-
-
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x
-| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x
-
-
-##### Training stability test
-
-To get a fuller picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1
-| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1
-| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `run_electricity.sh` and `run_traffic.sh` training scripts in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|------|----
-| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1
-| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-## Release notes
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-There are no known issues with this model.
-
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG b/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/PyTorch/Forecasting/TFT/tft_pyt/TFT_architecture.PNG and /dev/null differ
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/configuration.py b/PyTorch/Forecasting/TFT/tft_pyt/configuration.py
deleted file mode 100644
index bef26e66..00000000
--- a/PyTorch/Forecasting/TFT/tft_pyt/configuration.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/PyTorch/Forecasting/TFT/tft_pyt/criterions.py b/PyTorch/Forecasting/TFT/tft_pyt/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py b/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. - # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == 
InputTypes.TARGET: - tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.ID: - tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) - - - tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - -def get_dataset_splits(df, config): - - if hasattr(config, 'relative_split') and config.relative_split: - forecast_len = config.example_length - config.encoder_length - # The valid split is shifted from the train split by number of the forecast steps to the future. - # The test split is shifted by the number of the forecast steps from the valid split - train = [] - valid = [] - test = [] - - for _, group in df.groupby(DEFAULT_ID_COL): - index = group[config.time_ids] - _train = group.loc[index < config.valid_boundary] - _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] - _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] - train.append(_train) - valid.append(_valid) - test.append(_test) - - train = pd.concat(train, axis=0) - valid = pd.concat(valid, axis=0) - test = pd.concat(test, axis=0) - else: - index = df[config.time_ids] - train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] - valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] - test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] - - return train, valid, test - -def flatten_ids(df, config): - - if config.missing_id_strategy == 'drop': - if hasattr(config, 'combine_ids') and config.combine_ids: - index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) - else: - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - index = df[id_col].isna() - index = index[index == True].index # Extract indices of nans - df.drop(index, inplace=True) - - if not (hasattr(config, 'combine_ids') and config.combine_ids): - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - ids = df[id_col].apply(str) - df.drop(id_col, axis=1, inplace=True) - encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) - df[DEFAULT_ID_COL] = encoder.transform(ids) - encoders = OrderedDict({id_col: encoder}) - - else: - encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} - encoders = OrderedDict(encoders) - lens = [len(v.classes_) for v in encoders.values()] - clens = np.roll(np.cumprod(lens), 1) - clens[0] = 1 - - # this takes a looooooot of time. Probably it would be better to create 2 dummy columns - df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) - df.drop(config.combine_ids, axis=1, inplace=True) - - return DEFAULT_ID_COL, encoders - -def impute(df, config): - #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance - if not hasattr(config, 'missing_data_label'): - return df, None - else: - imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) - data = df.values - col_mask = (data == config.missing_data_label).all(axis=0) - data[:,~col_mask] = imp.fit_transform(data) - return data, mask - -def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): - tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] - real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) - real_scalers = {} - tgt_scalers = {} - - def apply_scalers(df, name=None): - if name is None: - name = df.name - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None - df[real_cols] = real_scalers[name].transform(df[real_cols]) - if mask is not None and any(mask): - df[real_cols].mask(mask, 10**9) - df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) - return df - - if config.scale_per_id: - for identifier, sliced in train.groupby(id_col): - data = sliced[real_cols] - data, _ = impute(data, config) - real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) - # XXX We should probably remove examples that contain NaN as a target - target = sliced[tgt_cols] - tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) - - train = train.groupby(id_col).apply(apply_scalers) - # For valid and testing leave only timeseries previously present in train subset - # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away - valid = valid.loc[valid[id_col].isin(real_scalers.keys())] - valid = valid.groupby(id_col).apply(apply_scalers) - test = test.loc[test[id_col].isin(real_scalers.keys())] - test = test.groupby(id_col).apply(apply_scalers) - - else: - data, _ = impute(train[real_cols], config) - real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) - tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) - - train = apply_scalers(train, name='') - valid = apply_scalers(valid, name='') - test = apply_scalers(test, name='') - - return train, valid, test, real_scalers, tgt_scalers - -def encode_categoricals(train, valid, test, config): - cat_encodings = {} - cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) - num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? - # For TC performance reasons we might want for num_classes[i] be divisible by 8 - - # Train categorical encoders - for c in cat_cols: - if config.missing_cat_data_strategy == 'special_token': - #XXX this will probably require some data augmentation - unique = train[c].unique() - valid[c].loc[valid[c].isin(unique)] = '' - test[c].loc[test[c].isin(unique)] = '' - - if config.missing_cat_data_strategy == 'encode_all' or \ - config.missing_cat_data_strategy == 'special_token': - srs = pd.concat([train[c], valid[c], test[c]]).apply(str) - cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) - elif config.missing_cat_data_strategy == 'drop': - # TODO: implement this. 
In addition to dropping rows, this has to split the affected time series into chunks
-            # to prevent the data from having temporal gaps
-            pass
-        num_classes.append(srs.nunique())
-    print('Categorical variable encoding lengths: ', num_classes)
-
-
-    for split in [train, valid, test]:
-        for c in cat_cols:
-            srs = split[c].apply(str)
-            split[c] = srs
-            split.loc[:,c] = cat_encodings[c].transform(srs)
-
-    return cat_encodings
-
-
-def preprocess(src_path, dst_path, config):
-    df = pd.read_csv(src_path, index_col=0)
-
-    for c in config.features:
-        if c.feature_embed_type == DataTypes.DATE:
-            df[c.name] = pd.to_datetime(df[c.name])
-
-    # Leave only columns relevant to preprocessing
-    relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
-    df = df[relevant_columns]
-
-
-    id_col, id_encoders = flatten_ids(df, config)
-    df = df.reindex(sorted(df.columns), axis=1)
-
-    train, valid, test = get_dataset_splits(df, config)
-
-    # Length-filter the data (all timeseries shorter than the example length will be dropped)
-    #for df in [train, valid, test]:
-    #    df.groupby(id_col).filter(lambda x: len(x) >= config.example_length)
-    train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
-    valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
-    test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])
-
-    train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)
-
-    cat_encodings = encode_categoricals(train, valid, test, config)
-
-    os.makedirs(dst_path, exist_ok=True)
-
-    train.to_csv(os.path.join(dst_path, 'train.csv'))
-    valid.to_csv(os.path.join(dst_path, 'valid.csv'))
-    test.to_csv(os.path.join(dst_path, 'test.csv'))
-
-    # Save relevant columns in binary form for faster dataloading
-    # IMPORTANT: We always expect id to be a single column indicating the complete timeseries.
-    # We also expect a copy of id in the form of a static categorical input!
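The binary files written below rely on a bit-level trick: every remaining column, categorical or continuous, is cast to float32 and then reinterpreted (not converted) as int32, so one homogeneous array per timeseries can be pickled; TFTBinaryDataset reverses the view at load time. A minimal sketch of that round trip, using made-up toy columns:

import numpy as np

cat = np.array([3., 7.], dtype=np.float32)       # categorical codes, cast to float32
cont = np.array([0.5, 1.25], dtype=np.float32)   # continuous values
packed = np.stack([cat, cont], axis=1).view(np.int32)  # stored form, as in the *.bin files

unpacked = packed.view(np.float32)               # what TFTBinaryDataset does on load
codes = unpacked[:, 0].astype(np.int64)          # DTYPE_MAP[DataTypes.CATEGORICAL]
reals = unpacked[:, 1].astype(np.float32)        # DTYPE_MAP[DataTypes.CONTINUOUS]
assert codes.tolist() == [3, 7] and np.allclose(reals, [0.5, 1.25])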
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
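The ffill/bfill pair above is how the active range of a series is found: after a forward fill only leading NaNs survive, so the first index dropna() keeps is the first real observation; the backward fill symmetrically gives the last one. A toy illustration with made-up values:

import numpy as np
import pandas as pd

srs = pd.Series([np.nan, np.nan, 1., np.nan, 2., np.nan],
                index=pd.date_range('2014-01-01', periods=6, freq='H'))
start_date = min(srs.fillna(method='ffill').dropna().index)  # third timestamp
end_date = max(srs.fillna(method='bfill').dropna().index)    # fifth timestamp
active_range = (srs.index >= start_date) & (srs.index <= end_date)
assert srs[active_range].fillna(0.).tolist() == [1., 0., 2.]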
- - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['categorical_id'] = label - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - output['categorical_id'] = output['id'].copy() - output['hours_from_start'] = output['t'] - output['categorical_day_of_week'] = output['day_of_week'].copy() - output['categorical_hour'] = output['hour'].copy() - - output.to_csv(os.path.join(path, 'standarized.csv')) - -def standarize_volatility(path): - df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index - - # Adds additional date/day fields - idx = [str(s).split('+')[0] for s in df.index - ] # ignore timezones, we don't need them - dates = pd.to_datetime(idx) - df['date'] = dates - df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days - df['day_of_week'] = dates.dayofweek - df['day_of_month'] = dates.day - df['week_of_year'] = dates.weekofyear - df['month'] = dates.month - df['year'] = dates.year - df['categorical_id'] = df['Symbol'].copy() - - # Processes log volatility - vol = df['rv5_ss'].copy() - vol.loc[vol == 0.] = np.nan - df['log_vol'] = np.log(vol) - - # Adds static information - symbol_region_mapping = { - '.AEX': 'EMEA', - '.AORD': 'APAC', - '.BFX': 'EMEA', - '.BSESN': 'APAC', - '.BVLG': 'EMEA', - '.BVSP': 'AMER', - '.DJI': 'AMER', - '.FCHI': 'EMEA', - '.FTMIB': 'EMEA', - '.FTSE': 'EMEA', - '.GDAXI': 'EMEA', - '.GSPTSE': 'AMER', - '.HSI': 'APAC', - '.IBEX': 'EMEA', - '.IXIC': 'AMER', - '.KS11': 'APAC', - '.KSE': 'APAC', - '.MXX': 'AMER', - '.N225': 'APAC ', - '.NSEI': 'APAC', - '.OMXC20': 'EMEA', - '.OMXHPI': 'EMEA', - '.OMXSPI': 'EMEA', - '.OSEAX': 'EMEA', - '.RUT': 'EMEA', - '.SMSI': 'EMEA', - '.SPX': 'AMER', - '.SSEC': 'APAC', - '.SSMI': 'EMEA', - '.STI': 'APAC', - '.STOXX50E': 'EMEA' - } - - df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) - - # Performs final processing - output_df_list = [] - for grp in df.groupby('Symbol'): - sliced = grp[1].copy() - sliced.sort_values('days_from_start', inplace=True) - # Impute log volatility values - sliced['log_vol'].fillna(method='ffill', inplace=True) - sliced.dropna() - output_df_list.append(sliced) - - df = pd.concat(output_df_list, axis=0) - - df.to_csv(os.path.join(path, 'standarized.csv')) - - -def standarize_traffic(path): - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(path, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(path, filename), 'r') as dat: - lines = dat.readlines() - for i, line in enumerate(lines): - if (i + 1) % 50 == 0: - print('Completed {} of {} rows for {}'.format(i + 1, len(lines), - filename)) - array = [ 
-                    process_list(row_split, variable_type=float, delimiter=None)
-                    for row_split in process_list(
-                        line, variable_type=str, delimiter=';')
-                ]
-                array_list.append(array)
-
-        return array_list
-
-    shuffle_order = np.array(read_single_list('randperm')) - 1  # index from 0
-    train_dayofweek = read_single_list('PEMS_trainlabels')
-    train_tensor = read_matrix('PEMS_train')
-    test_dayofweek = read_single_list('PEMS_testlabels')
-    test_tensor = read_matrix('PEMS_test')
-
-    # Invert the shuffle-order permutation
-    print('Shuffling')
-    inverse_mapping = {
-        new_location: previous_location
-        for previous_location, new_location in enumerate(shuffle_order)
-    }
-    reverse_shuffle_order = np.array([
-        inverse_mapping[new_location]
-        for new_location, _ in enumerate(shuffle_order)
-    ])
-
-    # Group and reorder based on the permutation matrix
-    print('Reordering')
-    day_of_week = np.array(train_dayofweek + test_dayofweek)
-    combined_tensor = np.array(train_tensor + test_tensor)
-
-    day_of_week = day_of_week[reverse_shuffle_order]
-    combined_tensor = combined_tensor[reverse_shuffle_order]
-
-    # Put everything back into a dataframe
-    print('Parsing as dataframe')
-    labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]
-
-    hourly_list = []
-    for day, day_matrix in enumerate(combined_tensor):
-        # Hourly data
-        hourly = pd.DataFrame(day_matrix.T, columns=labels)
-        hourly['hour_on_day'] = [int(i / 6) for i in hourly.index]  # sampled at 10-minute intervals
-        if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
-            raise ValueError('Invalid hour! {}-{}'.format(
-                hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))
-
-        hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
-        hourly['sensor_day'] = day
-        hourly['time_on_day'] = hourly.index
-        hourly['day_of_week'] = day_of_week[day]
-
-        hourly_list.append(hourly)
-
-    hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)
-
-    # Flatten such that each entity uses one row in the dataframe
-    store_columns = [c for c in hourly_frame.columns if 'traj' in c]
-    other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
-    flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] +
-                           other_columns + ['id'])
-
-    for store in store_columns:
-        print('Processing {}'.format(store))
-
-        sliced = hourly_frame[[store] + other_columns].copy()
-        sliced.columns = ['values'] + other_columns
-        sliced['id'] = int(store.replace('traj_', ''))
-
-        # Sort by sensor-date-time
-        key = sliced['id'].apply(str) \
-            + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
-            + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
-        sliced = sliced.set_index(key).sort_index()
-
-        sliced['values'] = sliced['values'].fillna(method='ffill')
-        sliced['prev_values'] = sliced['values'].shift(1)
-        sliced['next_values'] = sliced['values'].shift(-1)
-
-        flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)
-
-    # Filter to match the range used by other academic papers
-    index = flat_df['sensor_day']
-    flat_df = flat_df[index < 173].copy()
-
-    # Create columns for categorical inputs
-    flat_df['categorical_id'] = flat_df['id'].copy()
-    flat_df['hours_from_start'] = flat_df['time_on_day'] \
-        + flat_df['sensor_day']*24.
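For reference, the dict-based inverse mapping built earlier in this function is simply the inverse of a permutation, which NumPy computes directly with argsort; a toy equivalence check (the permutation is made up):

import numpy as np

shuffle_order = np.array([2, 0, 1])  # hypothetical 3-element permutation
inverse_mapping = {new_location: previous_location
                   for previous_location, new_location in enumerate(shuffle_order)}
reverse_shuffle_order = np.array([inverse_mapping[new_location]
                                  for new_location in range(len(shuffle_order))])
assert (reverse_shuffle_order == np.argsort(shuffle_order)).all()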
- flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() - flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() - - flat_df.to_csv(os.path.join(path, 'standarized.csv')) - - -# XXX needs rework -def standarize_favorita(data_folder): - import gc - # Extract only a subset of data to save/process for efficiency - start_date = pd.datetime(2015, 1, 1) - end_date = pd.datetime(2016, 6, 1) - - print('Regenerating data...') - - # load temporal data - temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) - - store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) - oil = pd.read_csv( - os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] - holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) - items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) - transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) - - # Take first 6 months of data - temporal['date'] = pd.to_datetime(temporal['date']) - - # Filter dates to reduce storage space requirements - if start_date is not None: - temporal = temporal[(temporal['date'] >= start_date)] - if end_date is not None: - temporal = temporal[(temporal['date'] < end_date)] - - dates = temporal['date'].unique() - - # Add trajectory identifier - temporal['traj_id'] = temporal['store_nbr'].apply( - str) + '_' + temporal['item_nbr'].apply(str) - temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( - str) - - # Remove all IDs with negative returns - print('Removing returns data') - min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() - valid_ids = set(min_returns[min_returns >= 0].index) - selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) - new_temporal = temporal[selector].copy() - del temporal - gc.collect() - temporal = new_temporal - temporal['open'] = 1 - - # Resampling - print('Resampling to regular grid') - resampled_dfs = [] - for traj_id, raw_sub_df in temporal.groupby('traj_id'): - print('Resampling', traj_id) - sub_df = raw_sub_df.set_index('date', drop=True).copy() - sub_df = sub_df.resample('1d').last() - sub_df['date'] = sub_df.index - sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ - = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') - sub_df['open'] = sub_df['open'].fillna( - 0) # flag where sales data is unknown - sub_df['log_sales'] = np.log(sub_df['unit_sales']) - - resampled_dfs.append(sub_df.reset_index(drop=True)) - - new_temporal = pd.concat(resampled_dfs, axis=0) - del temporal - gc.collect() - temporal = new_temporal - - print('Adding oil') - oil.name = 'oil' - oil.index = pd.to_datetime(oil.index) - #XXX the lines below match the value of the oil on given date with the rest of the timeseries - # missing values in oil series are copied from the index before. Then the oil series is joined with - # temporal. Then there are some dates present in temporal which arent present in oil, for which - # oil values is substituted with -1. WHY?! - #TODO: check how many nans there are after first step. Previously oil series was extended by dates - # present in dates variable with nan value, which were forward filled. - # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. - # This leaves us with more nans after first step than previously. To achieve previous behavior - # we have to join series before filling nans. 
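A toy version of the join-then-fill order discussed in the comment above (dates and prices are made up): dates missing from the oil series become NaN only after the join, are then forward filled, and -1 is the final fallback for dates with no history at all.

import pandas as pd

oil = pd.Series([10., 12.], index=pd.to_datetime(['2015-01-01', '2015-01-03']), name='oil')
temporal = pd.DataFrame({'date': pd.to_datetime(['2014-12-31', '2015-01-01', '2015-01-04'])})

temporal = temporal.join(oil, on='date', how='left')      # 2014-12-31 and 2015-01-04 -> NaN
temporal['oil'] = temporal['oil'].fillna(method='ffill')  # 2015-01-04 inherits 10.0
temporal['oil'] = temporal['oil'].fillna(-1)              # 2014-12-31 has no history -> -1
assert temporal['oil'].tolist() == [-1., 10., 10.]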
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/ema.py b/PyTorch/Forecasting/TFT/tft_pyt/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py b/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
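The helpers in this file turn NVML's per-GPU CPU-affinity bitmask into a list of eligible core ids. A self-contained sketch of that decoding with a hypothetical one-word mask (the real words come from pynvml.nvmlDeviceGetCpuAffinity; no GPU is needed here):

# Hypothetical single-word mask: bits 0, 1 and 3 set -> cores 0, 1 and 3.
mask_words = [0b1011]

affinity_string = ''
for word in mask_words:
    affinity_string = '{:064b}'.format(word) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse()  # so core 0 is the 0th element of the list
cores = [i for i, e in enumerate(affinity_list) if e != 0]
assert cores == [0, 1, 3]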
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/PyTorch/Forecasting/TFT/tft_pyt/inference.py b/PyTorch/Forecasting/TFT/tft_pyt/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, 
config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) - - if 
args.joint_visualization:
-                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
-                for q, t in _g.items():  # target and quantiles, time-horizon values
-                    if q == 'targets':
-                        targets = torch.cat([t[:,0], t[-1,1:]])  # WIP
-                        # We want to plot targets on the same graph as the predictions. Probably could be written better.
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons which are shifted in phase
-                    # Next lines realign them
-                    y = t.new_full((t.shape[0] + t.shape[1] - 1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y):  # timestep, time-horizon values
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)  # v == v filters out NaNs
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up the model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-    model.cuda()
-
-    # Set up the dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
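For reference, the q-risk returned by inference() above is the quantile (pinball) loss from criterions.py, normalized by the mean absolute target and doubled; a small worked example with made-up numbers:

import torch
import torch.nn.functional as F

q = torch.tensor([0.1, 0.5, 0.9])                # config.quantiles
predictions = torch.tensor([[[1.0, 2.0, 3.0]]])  # one series, one step, three quantiles
targets = torch.tensor([[[2.5]]])

diff = predictions - targets                     # [-1.5, -0.5, 0.5]
ql = (1 - q) * F.relu(diff) + q * F.relu(-diff)  # pinball loss per quantile
losses = ql.view(-1, ql.shape[-1]).mean(0)       # [0.15, 0.25, 0.05]
q_risk = 2 * losses / targets.abs().mean()
assert torch.allclose(q_risk, torch.tensor([0.12, 0.20, 0.04]))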
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum': sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(quantiles['test_p10'], quantiles['test_p50'], quantiles['test_p90']))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on a separate plot. Projections will be concatenated.')
-    parser.add_argument('--save_predictions', action='store_true')
-    parser.add_argument('--results', type=str, default='/results')
-    parser.add_argument('--log_file', type=str, default='dllogger.json')
-    ARGS = parser.parse_args()
-    main(ARGS)
diff --git a/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py b/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py
deleted file mode 100644
index 83d2ac7f..00000000
--- a/PyTorch/Forecasting/TFT/tft_pyt/log_helper.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
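The file below wires dllogger to TensorBoard through a custom backend. A minimal stdout variant following the same interface (the metadata/log/flush methods plus the log_level property) might look like this sketch, which is not part of the original code:

from dllogger import Backend

class PrintBackend(Backend):
    """Toy backend mirroring the interface TensorBoardBackend implements below."""
    def __init__(self, verbosity):
        super().__init__(verbosity=verbosity)

    @property
    def log_level(self):
        return self._log_level

    def metadata(self, timestamp, elapsedtime, metric, metadata):
        pass  # this toy backend ignores metric metadata

    def log(self, timestamp, elapsedtime, step, data):
        print('step={} {}'.format(step, ' '.join('{}={}'.format(k, v) for k, v in data.items())))

    def flush(self):
        pass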
- -import os -import subprocess -import sys -import itertools -import atexit - -import dllogger -from dllogger import Backend, JSONStreamBackend, StdOutBackend - -import torch.distributed as dist -from torch.utils.tensorboard import SummaryWriter - -class TensorBoardBackend(Backend): - def __init__(self, verbosity, log_dir): - super().__init__(verbosity=verbosity) - self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), - flush_secs=120, - max_queue=200 - ) - self.hp_cache = None - atexit.register(self.summary_writer.close) - - @property - def log_level(self): - return self._log_level - - def metadata(self, timestamp, elapsedtime, metric, metadata): - pass - - def log(self, timestamp, elapsedtime, step, data): - if step == 'HPARAMS': - parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} - #Unpack list and tuples - for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: - parameters.update(d) - #Remove custom classes - parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} - parameters.update({k:'None' for k, v in data.items() if v is None}) - self.hp_cache = parameters - if step == (): - if self.hp_cache is None: - print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) - return - self.summary_writer.add_hparams(self.hp_cache, data) - if not isinstance(step, int): - return - for k, v in data.items(): - self.summary_writer.add_scalar(k, v, step) - - def flush(self): - pass - -def setup_logger(args): - os.makedirs(args.results, exist_ok=True) - log_path = os.path.join(args.results, args.log_file) - - if os.path.exists(log_path): - for i in itertools.count(): - s_fname = args.log_file.split('.') - fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
-    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git
a/PyTorch/Forecasting/TFT/tft_pyt/modeling.py b/PyTorch/Forecasting/TFT/tft_pyt/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. Temporal known a priori continuous - # 5. Temporal observed categorical - # 6. Temporal observed continuous - # 7. 
Temporal observed targets (time series obseved so far) - - self.s_cat_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None - self.t_cat_k_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None - self.t_cat_o_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None - - self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) - - self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) - - if self.s_cont_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) - if self.t_cont_k_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) - if self.t_cont_o_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) - torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) - - def _apply_embedding(self, - cat: Optional[Tensor], - cont: Optional[Tensor], - cat_emb: Optional[nn.ModuleList], - cont_emb: Tensor, - cont_bias: Tensor, - ) -> Tuple[Optional[Tensor], Optional[Tensor]]: - e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None - if cont is not None: - #the line below is equivalent to following einsums - #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) - #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) - e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) - e_cont = e_cont + cont_bias - else: - e_cont = None - - if e_cat is not None and e_cont is not None: - return torch.cat([e_cat, e_cont], dim=-2) - elif e_cat is not None: - return e_cat - elif e_cont is not None: - return e_cont - else: - return None - - def forward(self, x: Dict[str, Tensor]): - # temporal/static categorical/continuous known/observed input - s_cat_inp = x.get('s_cat', None) - s_cont_inp = x.get('s_cont', None) - t_cat_k_inp = x.get('k_cat', None) - t_cont_k_inp = x.get('k_cont', None) - t_cat_o_inp = x.get('o_cat', None) - t_cont_o_inp = x.get('o_cont', None) - t_tgt_obs = x['target'] # Has to be present - - # Static inputs are expected to be equal for all timesteps - # For memory efficiency there is no assert statement - s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None - s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None - - s_inp = self._apply_embedding(s_cat_inp, - s_cont_inp, - self.s_cat_embed, - self.s_cont_embedding_vectors, - self.s_cont_embedding_bias) - t_known_inp = self._apply_embedding(t_cat_k_inp, - t_cont_k_inp, - 
self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - #the line below performs batched matrix vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_c context - # state_h context - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) - attn_score.mul_(self.scale) - - if mask_future_timesteps: - attn_score = attn_score + self._mask - - attn_prob = 
F.softmax(attn_score, dim=3) - attn_prob = self.attn_dropout(attn_prob) - - # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v) - attn_vec = torch.matmul(attn_prob, v.unsqueeze(1)) - m_attn_vec = torch.mean(attn_vec, dim=1) - out = self.out_proj(m_attn_vec) - out = self.out_dropout(out) - - return out, attn_vec - - - -class TemporalFusionTransformer(nn.Module): - """ - Implementation of https://arxiv.org/abs/1912.09363 - """ - def __init__(self, config): - super().__init__() - - if hasattr(config, 'model'): - config = config.model - - self.encoder_length = config.encoder_length # determines how far into the past the model looks for input data - - self.embedding = TFTEmbedding(config) - self.static_encoder = StaticCovariateEncoder(config) - - self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars) - self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars) - self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True) - - - self.input_gate = GLU(config.hidden_size, config.hidden_size) - self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.enrichment_grn = GRN(config.hidden_size, - config.hidden_size, - context_hidden_size=config.hidden_size, - dropout=config.dropout) - self.attention = InterpretableMultiHeadAttention(config) - self.attention_gate = GLU(config.hidden_size, config.hidden_size) - self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.positionwise_grn = GRN(config.hidden_size, - config.hidden_size, - dropout=config.dropout) - - self.decoder_gate = GLU(config.hidden_size, config.hidden_size) - self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3) - - self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles)) - - def forward(self, x: Dict[str, Tensor]) -> Tensor: - s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x) - - # Static context - cs, ce, ch, cc = self.static_encoder(s_inp) - ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # LSTM initial states - - # Temporal input - _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]] - if t_observed_inp is not None: - _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:]) - - historical_inputs = torch.cat(_historical_inputs, dim=-2) - future_inputs = t_known_inp[:, self.encoder_length:] - - # Encoders - historical_features, _ = self.history_vsn(historical_inputs, cs) - history, state = self.history_encoder(historical_features, (ch, cc)) - future_features, _ = self.future_vsn(future_inputs, cs) - future, _ = self.future_encoder(future_features, state) - torch.cuda.synchronize() # this call gives perf boost for unknown reasons - - # skip connection - input_embedding = torch.cat([historical_features, future_features], dim=1) - temporal_features = torch.cat([history, future], dim=1) - temporal_features = self.input_gate(temporal_features) - temporal_features = temporal_features + input_embedding - temporal_features = self.input_gate_ln(temporal_features) - - # Static enrichment - enriched = self.enrichment_grn(temporal_features, c=ce) - - # Temporal self attention - x, _ = self.attention(enriched, mask_future_timesteps=True) - - # Don't compute historical quantiles - x = x[:, self.encoder_length:, :] - temporal_features = temporal_features[:, self.encoder_length:, :] - enriched = enriched[:, self.encoder_length:, :] - - x = self.attention_gate(x) - x = x +
enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt b/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/PyTorch/Forecasting/TFT/tft_pyt/train.py b/PyTorch/Forecasting/TFT/tft_pyt/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - 
args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * args.distributed_world_size, - exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - global_step += 1 - - validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) - - if validate.early_stop_c >= args.early_stopping: - print_once('Early stopping') - break - - ### TEST PHASE ### - state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') - if isinstance(model, DDP): - model.module.load_state_dict(state_dict['model']) - else: - model.load_state_dict(state_dict['model']) - model.cuda().eval() - - tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) - cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) - - unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - quantiles = 2 * 
losses / normalizer - - if args.distributed_world_size > 1: - quantiles = quantiles.cuda() - dist.all_reduce(quantiles) - quantiles /= args.distributed_world_size - - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} - dllogger.log(step=(), data=finish_log, verbosity=1) - -def validate(args, config, model, criterion, dataloader, global_step): - if not hasattr(validate, 'best_valid_loss'): - validate.best_valid_loss = float('inf') - if not hasattr(validate, 'early_stop_c'): - validate.early_stop_c = 0 - model.eval() - - losses = [] - validation_start = time.time() - for batch in dataloader: - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - bs = next(t for t in batch.values() if t is not None).shape[0] - losses.append((p_losses, bs)) - - validation_end = time.time() - - p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch may not be full - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses = p_losses/args.distributed_world_size - - ips = len(dataloader.dataset) / (validation_end - validation_start) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} - - if log_dict['loss'] < validate.best_valid_loss: - validate.best_valid_loss = log_dict['loss'] - validate.early_stop_c = 0 - validate.conv_step = global_step - if not dist.is_initialized() or dist.get_rank() == 0: - state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() - ckpt = {'args':args, 'config':config, 'model':state_dict} - torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) - if args.distributed_world_size > 1: - dist.barrier() - else: - validate.early_stop_c += 1 - - log_dict = {'val_'+k:v for k,v in log_dict.items()} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--data_path', type=str, required=True, - help='Path to the dataset') - parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), - help='Dataset name') - parser.add_argument('--epochs', type=int, default=25, - help='Number of training epochs') - parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], - help="""Subsample the dataset. Specify number of training and valid examples. - Values can be provided in scientific notation.
Floats will be truncated.""") - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seed', type=int, default=1) - parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') - parser.add_argument('--clip_grad', type=float, default=0.0) - parser.add_argument('--grad_accumulation', type=int, default=0) - parser.add_argument('--early_stopping', type=int, default=1000, - help='Stop training if validation loss does not improve for more than this number of epochs.') - parser.add_argument('--results', type=str, default='/results', - help='Directory in which results are stored') - parser.add_argument('--log_file', type=str, default='dllogger.json', - help='Name of dllogger output file') - parser.add_argument('--distributed_world_size', type=int, metavar='N', - default=torch.cuda.device_count(), - help='total number of GPUs across all nodes (default: all visible GPUs)') - parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, - help='rank of the current worker') - parser.add_argument('--local_rank', default=0, type=int, - help='rank of the current worker') - parser.add_argument('--overwrite_config', type=str, default='', - help='JSON string used to overload config') - parser.add_argument('--affinity', type=str, - default='socket_unique_interleaved', - choices=['socket', 'single', 'single_unique', - 'socket_unique_interleaved', - 'socket_unique_continuous', - 'disabled'], - help='type of CPU affinity') - parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') - - - ARGS = parser.parse_args() - main(ARGS) diff --git a/PyTorch/Forecasting/TFT/tft_pyt/utils.py b/PyTorch/Forecasting/TFT/tft_pyt/utils.py deleted file mode 100644 index bf88be40..00000000 --- a/PyTorch/Forecasting/TFT/tft_pyt/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import time - -class PerformanceMeter(): - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.count = 0 - self.total_time = 0 - self.last_update_time = time.time() - self.intervals = [] - - def update(self, n, exclude_from_total=False): - delta = time.time() - self.last_update_time - self.intervals.append(delta) - if not exclude_from_total: - self.total_time += delta - self.count += n - self.avg = self.count / self.total_time - self.last_update_time = time.time() - - return n/delta - - def reset_current_lap(self): - self.last_update_time = time.time() - - def p(self, i): - assert i <= 100 - # clamp the index so p(100) returns the maximum instead of indexing out of range - idx = min(len(self.intervals) - 1, int(len(self.intervals) * i / 100)) - return sorted(self.intervals)[idx] - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE index c1a81fee..050dd10c 100755 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2021 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md index 1845851d..be69560d 100755 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md +++ b/Tools/PyTorch/TimeSeriesPredictionPlatform/README.md @@ -33,8 +33,8 @@ The platform is designed to support multiple data types for input features, incl The TSPP utilizes the default configurations provided by each model for each accompanying dataset. More information on individual model configurations can be found within the respective model repositories. By default, Temporal Fusion Transformer (TFT) is included within the TSPP. ### Models - - Temporal Fusion Transformers XXX INSERT LINK HERE - - AutoARIMA +- [Temporal Fusion Transformer](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Forecasting/TFT) +- AutoARIMA ### Feature support matrix This tool supports the following features: diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT deleted file mode 100644 index 5d1d88cf..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/LICENSE AGREEMENT +++ /dev/null @@ -1,25 +0,0 @@ -Individual Contributor License Agreement (CLA) -Thank you for submitting your contributions to this project. - -By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. - -License. -You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. - -This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. 
Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions. - -Moral Rights. -To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. - -Third Party Content. -If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. - -Representations. -You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions. - -Disclaimer. -To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. - -No Obligation. -You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates.
- diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE deleted file mode 100644 index ae19bb47..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/NOTICE +++ /dev/null @@ -1,3 +0,0 @@ -TFT for PyTorch - -This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md deleted file mode 100644 index 69b39d12..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/README.md +++ /dev/null @@ -1,465 +0,0 @@ -# Temporal Fusion Transformer For PyTorch - -This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. - -## Table Of Contents - -- [Model overview](#model-overview) - * [Model architecture](#model-architecture) - * [Default configuration](#default-configuration) - * [Feature support matrix](#feature-support-matrix) - * [Features](#features) - * [Mixed precision training](#mixed-precision-training) - * [Enabling mixed precision](#enabling-mixed-precision) - * [Enabling TF32](#enabling-tf32) - * [Glossary](#glossary) -- [Setup](#setup) - * [Requirements](#requirements) -- [Quick Start Guide](#quick-start-guide) -- [Advanced](#advanced) - * [Scripts and sample code](#scripts-and-sample-code) - * [Command-line options](#command-line-options) - * [Getting the data](#getting-the-data) - * [Dataset guidelines](#dataset-guidelines) - * [Multi-dataset](#multi-dataset) - * [Training process](#training-process) - * [Inference process](#inference-process) -- [Performance](#performance) - * [Benchmarking](#benchmarking) - * [Training performance benchmark](#training-performance-benchmark) - * [Inference performance benchmark](#inference-performance-benchmark) - * [Results](#results) - * [Training accuracy results](#training-accuracy-results) - * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) - * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) - * [Training stability test](#training-stability-test) - * [Training performance results](#training-performance-results) - * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) - * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) -- [Release notes](#release-notes) - * [Changelog](#changelog) - * [Known issues](#known-issues) - - - -## Model overview - -The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford. -This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked in attention matrices or embedded as a special value in the latent space.
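To make the masking variant concrete, here is a minimal sketch (the tensor shapes and the names `scores` and `missing` are illustrative, not identifiers from this repository) of how missing timesteps can be excluded from attention before the softmax:

```python
import torch

# Illustrative shapes: attention scores are (batch, heads, time, time);
# `missing` is True where an observed input is absent in the raw data.
batch, heads, time = 2, 4, 8
scores = torch.randn(batch, heads, time, time)
missing = torch.rand(batch, time) < 0.1

# Keys at missing timesteps get -inf scores, so softmax assigns them zero
# attention weight (assumes at least one visible key per query row).
scores = scores.masked_fill(missing[:, None, None, :], float('-inf'))
weights = torch.softmax(scores, dim=-1)
```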
-This model enables the prediction of confidence intervals for future values of time series for multiple future timesteps. - -This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. - -### Model architecture - -The TFT model is a hybrid architecture joining LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these covariates, we feed the model the historical values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as an initial state of the LSTM encoders. -After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts. -For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction. -![](TFT_architecture.PNG) -*image source: https://arxiv.org/abs/1912.09363* - -### Default configuration - -The specific configuration of the TFT model depends on the dataset used. Not only is the volume of the model subject to change but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply per-time-series scaling, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile (pinball) loss -QL(y, ŷ, q) = max(q · (y − ŷ), (q − 1) · (y − ŷ)) -summed over the quantiles q ∈ {0.1, 0.5, 0.9} and all prediction horizons. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
- -| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | -| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 - -### Feature support matrix - -The following features are supported by this model: - -| Feature | Yes column -|----------------------------|-------------------------- -|Distributed data parallel | Yes -|PyTorch AMP | Yes - - -#### Features - -[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) -provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. - -[PyTorch -DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module -wrapper that enables easy multiprocess distributed data-parallel -training. - -### Mixed precision training - -Mixed precision is the combined use of different numerical precisions in a -computational method. -[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant -computational speedup by performing operations in half-precision format while -storing minimal information in single-precision to retain as much information -as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with -both the Turing and Ampere architectures, significant training speedups are -experienced by switching to -mixed precision -- up to 3x overall speedup on the most arithmetically intense -model architectures. Using mixed precision training previously required two -steps: - -1. Porting the model to use the FP16 data type where appropriate. -2. Manually adding loss scaling to preserve small gradient values. - -The ability to train deep learning networks with lower precision was introduced -in the Pascal architecture and first supported in [CUDA -8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep -Learning SDK. - -For information about: -* How to train using mixed precision, refer to the [Mixed Precision - Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed - Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) - documentation. -* Techniques used for mixed precision training, refer to the [Mixed-Precision - Training of Deep Neural - Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) - blog. -* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in - PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) - . - - -#### Enabling mixed precision - - -Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the GradScaler class. 
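A minimal sketch of that native AMP recipe (using a stand-in model and random data; note that the training script in this repository relies on Apex AMP rather than the native module):

```python
import torch
from torch import nn

model = nn.Linear(16, 3).cuda()           # stand-in model
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(32, 16, device='cuda')
y = torch.randn(32, 3, device='cuda')

optimizer.zero_grad()
with torch.cuda.amp.autocast():           # forward pass runs in mixed precision
    loss = criterion(model(x), y)
scaler.scale(loss).backward()             # scale the loss to preserve small gradients
scaler.step(optimizer)                    # unscales gradients; skips the step on overflow
scaler.update()                           # adjusts the loss scale dynamically
```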
-#### Enabling TF32
-
-TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
-
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
-
-For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
-
-TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
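-
-If you need to verify or control this behavior explicitly, PyTorch exposes TF32 through two global switches. A minimal sketch, assuming the PyTorch version shipped in the NGC container listed below:
-
-```python
-import torch
-
-# TF32 is enabled by default on Ampere GPUs; these flags make the choice explicit.
-torch.backends.cuda.matmul.allow_tf32 = True  # matrix multiplications may use TF32 Tensor Cores
-torch.backends.cudnn.allow_tf32 = True        # cuDNN convolutions may use TF32
-
-# Setting both flags to False forces full FP32 math instead.
-```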
-
-
-
-### Glossary
-
-**Multi horizon prediction**
-Process of estimating values of a time series for multiple future time steps.
-
-**Quantiles**
-Cut points dividing the range of a probability distribution into intervals with equal probabilities.
-
-**Time series**
-Series of data points indexed and equally spaced in time.
-
-**Transformer**
-The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to start training the TFT model.
-
-### Requirements
-
-This repository contains a Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-- Supported GPUs:
-  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
-  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
-
-For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
-
-
-For those unable to use the PyTorch NGC container, to set up the required environment or to create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
-
-## Quick Start Guide
-
-To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
-
-1. Clone the repository.
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/Forecasting/TFT
-```
-
-2. Build the TFT PyTorch NGC container.
-```bash
-docker build --network=host -t tft .
-```
-
-3. Start an interactive session in the NGC container to run training/inference.
-```bash
-docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
-```
-
-Note: Ensure you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
-
-4. Download and preprocess the datasets.
-```bash
-bash scripts/get_data.sh
-```
-
-5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
-These scripts are tuned for DGX-1 with 32GB V100 GPUs. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system.
-```bash
-bash scripts/run_electricity.sh
-bash scripts/run_traffic.sh
-```
-
-6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
-```
-
-7. Start inference/predictions. Visualize and save the predictions by running the following command.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
---visualize \
---save_predictions
-```
-
-
-
-Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
-## Advanced
-
-The following sections provide more details about the dataset, running training and inference, and the training results.
-
-### Scripts and sample code
-
-In the root directory, the most important files are:
-
-- `train.py`: Entry point for training
-- `data_utils.py`: File containing the dataset implementation and preprocessing functions
-- `modeling.py`: Definition of the model
-- `configuration.py`: Contains configuration classes for various experiments
-- `test.py`: Entry point for testing a trained model
-- `Dockerfile`: Container definition
-- `log_helper.py`: Contains helper functions for setting up dllogger
-- `criterions.py`: Definitions of loss functions
-
-The `scripts` directory contains scripts for the default use cases:
-- `run_electricity.sh`: trains the default model on the electricity dataset
-- `run_traffic.sh`: trains the default model on the traffic dataset
-
-### Command-line options
-
-To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
-`python train.py --help`.
-
-The following example output is printed when running the model:
-```
-usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
-                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data_path DATA_PATH
-  --dataset {electricity,volatility,traffic,favorita}
-  --epochs EPOCHS
-  --sample_data SAMPLE_DATA SAMPLE_DATA
-  --batch_size BATCH_SIZE
-  --lr LR
-  --seed SEED
-  --use_amp             Enable automatic mixed precision
-  --clip_grad CLIP_GRAD
-  --early_stopping EARLY_STOPPING
-                        Stop training if validation loss does not improve for more than this number of epochs.
-  --results RESULTS
-  --log_file LOG_FILE
-  --distributed_world_size N
-                        total number of GPUs across all nodes (default: all visible GPUs)
-  --distributed_rank DISTRIBUTED_RANK
-                        rank of the current worker
-  --local_rank LOCAL_RANK
-                        rank of the current worker
-  --overwrite_config OVERWRITE_CONFIG
-                        JSON string used to overload config
-
-```
-
-### Getting the data
-
-The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers.
-#### Dataset guidelines
-
-The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed to the common format that contains the features we will use for training. Then the standardized data is cleaned, normalized, encoded, and binarized.
-This step does the following:
-- Drops all the columns that are not marked in the configuration file as used for training or preprocessing
-- Flattens indices in case time series are indexed by more than one column
-- Splits the data into training, validation, and test splits
-- Filters out all the time series shorter than the minimal example length
-- Normalizes columns marked as continuous in the configuration file
-- Encodes as integers the columns marked as categorical
-- Saves the data in csv and binary formats
-
-#### Multi-dataset
-In order to use an alternate dataset, you have to write a function that parses your data to a common format (a parsing sketch follows this section). The format is as follows:
-- There is at least one id column
-- There is exactly one time column (that can also be used as a feature column)
-- Each feature is in a separate column
-- Each row represents a moment in time for only one time series
-
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
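-
-A minimal parsing sketch is shown below. The input layout, column names, and derived features here are hypothetical illustrations, not names this repository expects:
-
-```python
-import pandas as pd
-
-def parse_to_common_format(path: str) -> pd.DataFrame:
-    """Reshape a hypothetical wide CSV (one column per sensor) into the long
-    format described above: one id column, one time column, one column per
-    feature, and one row per (time series, timestamp) pair."""
-    df = pd.read_csv(path, parse_dates=["timestamp"])
-    # Wide to long: each sensor column becomes a separate time series.
-    long = df.melt(id_vars=["timestamp"], var_name="series_id", value_name="power_usage")
-    # Known features can be derived from the time column.
-    long["hour"] = long["timestamp"].dt.hour
-    long["day_of_week"] = long["timestamp"].dt.dayofweek
-    return long.sort_values(["series_id", "timestamp"]).reset_index(drop=True)
-```
-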
-### Training process
-
-The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early when the validation loss has not improved for N consecutive epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss QL(y, z, q) = max(q(y - z), (q - 1)(y - z)) (with target y and prediction z), summed over the quantiles q in {0.1, 0.5, 0.9}. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric, which is the quantile loss normalized by the magnitude of the targets: q-risk(q) = 2 * sum(QL) / sum(|y|).
-Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint that had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs.
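-
-For reference, the quantile (pinball) loss above can be written in a few lines of PyTorch. This is a sketch of the standard definition; the repository's own implementation lives in `criterions.py` and may differ in details:
-
-```python
-import torch
-
-def quantile_loss(pred: torch.Tensor, target: torch.Tensor,
-                  quantiles=(0.1, 0.5, 0.9)) -> torch.Tensor:
-    """pred: (..., len(quantiles)), target: (...). Pinball loss summed over quantiles."""
-    losses = []
-    for i, q in enumerate(quantiles):
-        diff = target - pred[..., i]
-        # max(q * diff, (q - 1) * diff) penalizes under- and over-prediction asymmetrically.
-        losses.append(torch.max(q * diff, (q - 1) * diff))
-    return torch.stack(losses, dim=-1).sum(dim=-1).mean()
-```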
-
-### Inference process
-
-Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running the inference. Example command:
-```
-python inference.py \
---checkpoint /results/checkpoint.pt \
---data /data/processed/electricity_bin/test.csv \
---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
---batch_size 2048 \
---visualize \
---save_predictions \
---joint_visualization \
---results /results \
---use_amp
-```
-
-In the default setting, the script evaluates the model on the specified dataset and prints the q-risk evaluated on this dataset. In order to save the predictions, use the `--save_predictions` option. Predictions will be stored in csv format in the directory specified by the `--results` option. The `--joint_visualization` option plots graphs in TensorBoard format, which lets you inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file.
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in training and inference modes.
-
-#### Training performance benchmark
-
-In order to run training benchmarks, use the `scripts/benchmark.sh` script.
-
-#### Inference performance benchmark
-
-To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
-### Results
-
-The following sections provide details on how we achieved our performance and accuracy in training and inference.
-
-#### Training accuracy results
-
-We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs.
-
-##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s  | 176s  | 1.227x
-| Traffic     | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s  | 726s  | 1.318x
-| Traffic     | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s  | 126s  | 1.198x
-
-
-
-
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s  | 261s  | 1.460x
-| Traffic     | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic     | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s  | 176s  | 1.455x
-
-
-
-##### Training stability test
-
-In order to get a fuller picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic     | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173  | 13703  | 1.35x | 1     | 1
-| Electricity | 8 | 1024 | 80596  | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic     | 1 | 1024 | 10197  | 13779  | 1.35x | 1     | 1
-| Traffic     | 8 | 1024 | 80692  | 107979 | 1.34x | 7.91x | 7.84x
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-
-##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|-------|------
-| Electricity | 1 | 1024 | 5580  | 9148  | 1.64x | 1     | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic     | 1 | 1024 | 5593  | 9194  | 1.64x | 1     | 1
-| Traffic     | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-## Release notes
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-There are no known issues with this model.
-
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TFT_architecture.PNG and /dev/null differ
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile
deleted file mode 100644
index 70552ea1..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/Dockerfile
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3
-
-FROM ${FROM_IMAGE_NAME}
-
-RUN apt-get update && apt-get install -y libb64-dev libb64-0d
-WORKDIR /workspace
-#ENV PYTHONPATH /workspace
-RUN pip uninstall -y typing
-
-RUN apt update && apt install -y p7zip-full
-COPY requirements.txt .
-RUN pip install --upgrade pip
-RUN pip install --no-cache-dir --ignore-installed -r requirements.txt
-RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger
-
-COPY . .
-ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT deleted file mode 100644 index 5d1d88cf..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/LICENSE AGREEMENT +++ /dev/null @@ -1,25 +0,0 @@ -Individual Contributor License Agreement (CLA) -Thank you for submitting your contributions to this project. - -By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project. - -License. -You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement. - -This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore you also represent that you have the authority to perform the above waiver with respect to the entirety of you contributions. - -Moral Rights. -To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. - -Third Party Content. -If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. 
For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. - -Representations. -You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer (s) has waived all of their right, title or interest in or to your Contributions. - -Disclaimer. -To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support. - -No Obligation. -You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE deleted file mode 100644 index ae19bb47..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/NOTICE +++ /dev/null @@ -1,3 +0,0 @@ -TFT for PyTorch - -This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md deleted file mode 100644 index 69b39d12..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/README.md +++ /dev/null @@ -1,465 +0,0 @@ -# Temporal Fusion Transformer For PyTorch - -This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. 
-
-## Table Of Contents
-
-- [Model overview](#model-overview)
-  * [Model architecture](#model-architecture)
-  * [Default configuration](#default-configuration)
-  * [Feature support matrix](#feature-support-matrix)
-    * [Features](#features)
-  * [Mixed precision training](#mixed-precision-training)
-    * [Enabling mixed precision](#enabling-mixed-precision)
-    * [Enabling TF32](#enabling-tf32)
-  * [Glossary](#glossary)
-- [Setup](#setup)
-  * [Requirements](#requirements)
-- [Quick Start Guide](#quick-start-guide)
-- [Advanced](#advanced)
-  * [Scripts and sample code](#scripts-and-sample-code)
-  * [Command-line options](#command-line-options)
-  * [Getting the data](#getting-the-data)
-    * [Dataset guidelines](#dataset-guidelines)
-    * [Multi-dataset](#multi-dataset)
-  * [Training process](#training-process)
-  * [Inference process](#inference-process)
-- [Performance](#performance)
-  * [Benchmarking](#benchmarking)
-    * [Training performance benchmark](#training-performance-benchmark)
-    * [Inference performance benchmark](#inference-performance-benchmark)
-  * [Results](#results)
-    * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
-      * [Training stability test](#training-stability-test)
-    * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
-- [Release notes](#release-notes)
-  * [Changelog](#changelog)
-  * [Known issues](#known-issues)
-
-
-
-## Model overview
-
-The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
-This model enables the prediction of confidence intervals for future values of time series for multiple future time steps.
-
-This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
-### Model architecture
-
-The TFT model is a hybrid architecture that joins LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to historical data, we feed the model with historical values of the time series. All variables are embedded in a high-dimensional space by learning an embedding vector. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as an initial state of the LSTM encoders.
-After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts.
-For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
-![](TFT_architecture.PNG)
-*image source: https://arxiv.org/abs/1912.09363*
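-
-To make the continuous-variable embedding concrete, here is a minimal sketch of the mechanism described above. It is an illustration only, not code from this repository's `modeling.py`:
-
-```python
-import torch
-import torch.nn as nn
-
-class ContinuousEmbedding(nn.Module):
-    """One learned vector per continuous variable, scaled by the variable's value."""
-    def __init__(self, num_vars: int, hidden_size: int):
-        super().__init__()
-        self.weight = nn.Parameter(torch.randn(num_vars, hidden_size) * 0.02)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x: (batch, num_vars) -> (batch, num_vars, hidden_size)
-        return x.unsqueeze(-1) * self.weight
-```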
-
-### Default configuration
-
-The specific configuration of the TFT model depends on the dataset used. Not only does the size of the model change between datasets, but so do the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply per-time-series scaling, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss QL(y, z, q) = max(q(y - z), (q - 1)(y - z)) (with target y and prediction z), summed over the quantiles q in {0.1, 0.5, 0.9}. The default configurations are tuned for distributed training on DGX-1 with 32GB V100 GPUs with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
-
-| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-
-### Feature support matrix
-
-The following features are supported by this model:
-
-| Feature | TFT |
-|----------------------------|--------------------------|
-| Distributed data parallel | Yes |
-| PyTorch AMP | Yes |
-
-
-#### Features
-
-[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html)
-provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
-
-[PyTorch DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module wrapper that enables easy multiprocess distributed data-parallel training.
-
-### Mixed precision training
-
-Mixed precision is the combined use of different numerical precisions in a computational method.
-[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network.
-Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta architecture, and continuing with the Turing and Ampere architectures, significant training speedups can be achieved by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training previously required two steps:
-
-1. Porting the model to use the FP16 data type where appropriate.
-2. Manually adding loss scaling to preserve small gradient values.
-
-The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
-
-For information about:
-* How to train using mixed precision, refer to the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and the [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
-* Techniques used for mixed precision training, refer to the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
-* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog.
-
-
-#### Enabling mixed precision
-
-Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision `torch.cuda.amp` module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the `GradScaler` class. All the necessary steps to implement AMP are described in detail in the [AMP examples](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples).
-
-To enable mixed precision for TFT, simply add the `--use_amp` option to the training script.
-#### Enabling TF32
-
-TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
-
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
-
-For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
-
-TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
-
-
-
-### Glossary
-
-**Multi horizon prediction**
-Process of estimating values of a time series for multiple future time steps.
-
-**Quantiles**
-Cut points dividing the range of a probability distribution into intervals with equal probabilities.
-
-**Time series**
-Series of data points indexed and equally spaced in time.
-
-**Transformer**
-The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to start training the TFT model.
-
-### Requirements
-
-This repository contains a Dockerfile, which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-- Supported GPUs:
-  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
-  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
-
-For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
-
-
-For those unable to use the PyTorch NGC container, to set up the required environment or to create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
-
-## Quick Start Guide
-
-To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
-
-1. Clone the repository.
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/Forecasting/TFT
-```
-
-2. Build the TFT PyTorch NGC container.
-```bash
-docker build --network=host -t tft .
-```
-
-3. Start an interactive session in the NGC container to run training/inference.
-```bash
-docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
-```
-
-Note: Ensure you mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
-
-4. Download and preprocess the datasets.
-```bash
-bash scripts/get_data.sh
-```
-
-5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
-These scripts are tuned for DGX-1 with 32GB V100 GPUs. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system.
-```bash
-bash scripts/run_electricity.sh
-bash scripts/run_traffic.sh
-```
-
-6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
-```
-
-7. Start inference/predictions. Visualize and save the predictions by running the following command.
-```bash
-python inference.py \
---checkpoint <path to checkpoint> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
---visualize \
---save_predictions
-```
-
-
-
-Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance benchmark](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
-## Advanced
-
-The following sections provide more details about the dataset, running training and inference, and the training results.
-
-### Scripts and sample code
-
-In the root directory, the most important files are:
-
-- `train.py`: Entry point for training
-- `data_utils.py`: File containing the dataset implementation and preprocessing functions
-- `modeling.py`: Definition of the model
-- `configuration.py`: Contains configuration classes for various experiments
-- `test.py`: Entry point for testing a trained model
-- `Dockerfile`: Container definition
-- `log_helper.py`: Contains helper functions for setting up dllogger
-- `criterions.py`: Definitions of loss functions
-
-The `scripts` directory contains scripts for the default use cases:
-- `run_electricity.sh`: trains the default model on the electricity dataset
-- `run_traffic.sh`: trains the default model on the traffic dataset
-
-### Command-line options
-
-To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
-`python train.py --help`.
-
-The following example output is printed when running the model:
-```
-usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
-                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data_path DATA_PATH
-  --dataset {electricity,volatility,traffic,favorita}
-  --epochs EPOCHS
-  --sample_data SAMPLE_DATA SAMPLE_DATA
-  --batch_size BATCH_SIZE
-  --lr LR
-  --seed SEED
-  --use_amp             Enable automatic mixed precision
-  --clip_grad CLIP_GRAD
-  --early_stopping EARLY_STOPPING
-                        Stop training if validation loss does not improve for more than this number of epochs.
-  --results RESULTS
-  --log_file LOG_FILE
-  --distributed_world_size N
-                        total number of GPUs across all nodes (default: all visible GPUs)
-  --distributed_rank DISTRIBUTED_RANK
-                        rank of the current worker
-  --local_rank LOCAL_RANK
-                        rank of the current worker
-  --overwrite_config OVERWRITE_CONFIG
-                        JSON string used to overload config
-
-```
-
-### Getting the data
-
-The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers.
-#### Dataset guidelines
-
-The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed to the common format that contains the features we will use for training. Then the standardized data is cleaned, normalized, encoded, and binarized.
-This step does the following:
-- Drops all the columns that are not marked in the configuration file as used for training or preprocessing
-- Flattens indices in case time series are indexed by more than one column
-- Splits the data into training, validation, and test splits
-- Filters out all the time series shorter than the minimal example length
-- Normalizes columns marked as continuous in the configuration file
-- Encodes as integers the columns marked as categorical
-- Saves the data in csv and binary formats
-
-#### Multi-dataset
-In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows:
-- There is at least one id column
-- There is exactly one time column (that can also be used as a feature column)
-- Each feature is in a separate column
-- Each row represents a moment in time for only one time series
-
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
-### Training process
-
-The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early when the validation loss has not improved for N consecutive epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss QL(y, z, q) = max(q(y - z), (q - 1)(y - z)) (with target y and prediction z), summed over the quantiles q in {0.1, 0.5, 0.9}. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric, which is the quantile loss normalized by the magnitude of the targets: q-risk(q) = 2 * sum(QL) / sum(|y|).
-Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint that had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs.
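-
-The q-risk metric can be computed directly from saved predictions. A minimal sketch following the definition above (the repository computes this in its own evaluation code, which may differ in details):
-
-```python
-import torch
-
-def qrisk(pred: torch.Tensor, target: torch.Tensor, q: float) -> float:
-    """Normalized quantile loss: 2 * sum(QL) / sum(|target|)."""
-    diff = target - pred
-    ql = torch.max(q * diff, (q - 1) * diff)
-    return (2 * ql.sum() / target.abs().sum()).item()
-```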
-
-#### Multi-dataset
-
-In order to use an alternate dataset, you have to write a function that parses your data into the common format (a sketch follows below). The format is as follows:
-* There is at least one id column
-* There is exactly one time column (that can also be used as a feature column)
-* Each feature is in a separate column
-* Each row represents a moment in time for only one time series
-
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
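-
-A hypothetical parser might look like the sketch below; the input file and column names are made up for illustration, and only the four format rules above are assumed:
-```python
-import pandas as pd
-
-def standardize_my_dataset(src_csv, dst_csv):
-    """Parse a raw csv into the common format expected by the preprocessing."""
-    df = pd.read_csv(src_csv)
-
-    # One id column identifying each time series (rule 1).
-    df['id'] = df['sensor_name'].astype('category').cat.codes
-
-    # Exactly one time column; it may double as a feature (rule 2).
-    timestamps = pd.to_datetime(df['timestamp'])
-    df['hours_from_start'] = (timestamps - timestamps.min()) // pd.Timedelta(hours=1)
-
-    # Each feature in its own column, and one row per moment in time per
-    # time series (rules 3 and 4); the raw data is assumed to be long-form already.
-    df = df[['id', 'hours_from_start', 'value', 'day_of_week']]
-    df.to_csv(dst_csv)
-```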
-
-### Training process
-
-The `train.py` script is the entry point for the training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early when the validation loss has not improved for N consecutive epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file. You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample_data 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss. After training, the checkpoint with the lowest validation loss is evaluated on the test split with the q-risk metric.
-Results are stored in the `/results` directory by default. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs.
-
-### Inference process
-
-Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data before running inference.
-Example command:
-```
-python inference.py \
---checkpoint /results/checkpoint.pt \
---data /data/processed/electricity_bin/test.csv \
---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
---batch_size 2048 \
---visualize \
---save_predictions \
---joint_visualization \
---results /results
-```
-
-In the default setting, the script evaluates the model on the specified dataset and prints the q-risk evaluated on it. In order to save the predictions, use the `--save_predictions` option; predictions will be stored in csv format in the directory specified by the `--results` option. The `--joint_visualization` option plots graphs in TensorBoard format, which lets you inspect the results and compare them to the true values. With `--visualize`, a plot for each example is saved to a separate file.
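-
-For reference, the quantile loss optimized during training and the q-risk reported here follow the definitions in `criterions.py` and `inference.py`. For a target y, a prediction yhat, and a quantile q:
-```
-QL(y, yhat, q) = q * max(y - yhat, 0) + (1 - q) * max(yhat - y, 0)
-
-q-risk(q) = 2 * sum_t QL(y_t, yhat_t, q) / sum_t |y_t|
-```
-Since the quantiles are 0.1, 0.5, and 0.9, the reported values correspond to the P10, P50, and P90 risks.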
-
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in training and inference modes.
-
-#### Training performance benchmark
-
-In order to run training benchmarks, use the `scripts/benchmark.sh` script.
-
-#### Inference performance benchmark
-
-To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
-
-### Results
-
-The following sections provide details on how we achieved our performance and accuracy in training and inference.
-
-#### Training accuracy results
-
-We conducted an extensive hyperparameter search along with stability tests. The presented results are the averages from hundreds of runs. Accuracy values are reported as the P10 / P50 / P90 test q-risk (lower is better).
-
-##### Training accuracy: NVIDIA DGX A100 (A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x
-| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x
-| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x
-
-##### Training accuracy: NVIDIA DGX-1 (V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x
-| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x
-
-##### Training stability test
-
-In order to get a fuller picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1
-| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1
-| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metrics used were items per second.
-
-##### Training performance: NVIDIA DGX-1 (V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|------|----
-| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1
-| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metrics used were items per second.
-
-## Release notes
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-There are no known issues with this model.
-
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/TFT_architecture.PNG and /dev/null differ
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py
deleted file mode 100644
index bef26e66..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/configuration.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. - # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == 
InputTypes.TARGET: - tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.ID: - tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) - - - tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - -def get_dataset_splits(df, config): - - if hasattr(config, 'relative_split') and config.relative_split: - forecast_len = config.example_length - config.encoder_length - # The valid split is shifted from the train split by number of the forecast steps to the future. - # The test split is shifted by the number of the forecast steps from the valid split - train = [] - valid = [] - test = [] - - for _, group in df.groupby(DEFAULT_ID_COL): - index = group[config.time_ids] - _train = group.loc[index < config.valid_boundary] - _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] - _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] - train.append(_train) - valid.append(_valid) - test.append(_test) - - train = pd.concat(train, axis=0) - valid = pd.concat(valid, axis=0) - test = pd.concat(test, axis=0) - else: - index = df[config.time_ids] - train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] - valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] - test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] - - return train, valid, test - -def flatten_ids(df, config): - - if config.missing_id_strategy == 'drop': - if hasattr(config, 'combine_ids') and config.combine_ids: - index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) - else: - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - index = df[id_col].isna() - index = index[index == True].index # Extract indices of nans - df.drop(index, inplace=True) - - if not (hasattr(config, 'combine_ids') and config.combine_ids): - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - ids = df[id_col].apply(str) - df.drop(id_col, axis=1, inplace=True) - encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) - df[DEFAULT_ID_COL] = encoder.transform(ids) - encoders = OrderedDict({id_col: encoder}) - - else: - encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} - encoders = OrderedDict(encoders) - lens = [len(v.classes_) for v in encoders.values()] - clens = np.roll(np.cumprod(lens), 1) - clens[0] = 1 - - # this takes a looooooot of time. Probably it would be better to create 2 dummy columns - df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) - df.drop(config.combine_ids, axis=1, inplace=True) - - return DEFAULT_ID_COL, encoders - -def impute(df, config): - #XXX This ensures that out scaling will have the same mean. 
We still need to check the variance - if not hasattr(config, 'missing_data_label'): - return df, None - else: - imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) - data = df.values - col_mask = (data == config.missing_data_label).all(axis=0) - data[:,~col_mask] = imp.fit_transform(data) - return data, mask - -def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): - tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] - real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) - real_scalers = {} - tgt_scalers = {} - - def apply_scalers(df, name=None): - if name is None: - name = df.name - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None - df[real_cols] = real_scalers[name].transform(df[real_cols]) - if mask is not None and any(mask): - df[real_cols].mask(mask, 10**9) - df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) - return df - - if config.scale_per_id: - for identifier, sliced in train.groupby(id_col): - data = sliced[real_cols] - data, _ = impute(data, config) - real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) - # XXX We should probably remove examples that contain NaN as a target - target = sliced[tgt_cols] - tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) - - train = train.groupby(id_col).apply(apply_scalers) - # For valid and testing leave only timeseries previously present in train subset - # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away - valid = valid.loc[valid[id_col].isin(real_scalers.keys())] - valid = valid.groupby(id_col).apply(apply_scalers) - test = test.loc[test[id_col].isin(real_scalers.keys())] - test = test.groupby(id_col).apply(apply_scalers) - - else: - data, _ = impute(train[real_cols], config) - real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) - tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) - - train = apply_scalers(train, name='') - valid = apply_scalers(valid, name='') - test = apply_scalers(test, name='') - - return train, valid, test, real_scalers, tgt_scalers - -def encode_categoricals(train, valid, test, config): - cat_encodings = {} - cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) - num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warninig? - # For TC performance reasons we might want for num_classes[i] be divisible by 8 - - # Train categorical encoders - for c in cat_cols: - if config.missing_cat_data_strategy == 'special_token': - #XXX this will probably require some data augmentation - unique = train[c].unique() - valid[c].loc[valid[c].isin(unique)] = '' - test[c].loc[test[c].isin(unique)] = '' - - if config.missing_cat_data_strategy == 'encode_all' or \ - config.missing_cat_data_strategy == 'special_token': - srs = pd.concat([train[c], valid[c], test[c]]).apply(str) - cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) - elif config.missing_cat_data_strategy == 'drop': - # TODO: implement this. 
In addition to dropping rows this has to split specific time series in chunks - # to prevent data from having temporal gaps - pass - num_classes.append(srs.nunique()) - print('Categorical variables encodings lens: ', num_classes) - - - for split in [train, valid, test]: - for c in cat_cols: - srs = split[c].apply(str) - split[c] = srs - split.loc[:,c] = cat_encodings[c].transform(srs) - - return cat_encodings - - -def preprocess(src_path, dst_path, config): - df = pd.read_csv(src_path, index_col=0) - - for c in config.features: - if c.feature_embed_type == DataTypes.DATE: - df[c.name] = pd.to_datetime(df[c.name]) - - # Leave only columns relevant to preprocessing - relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) - df = df[relevant_columns] - - - id_col, id_encoders = flatten_ids(df, config) - df = df.reindex(sorted(df.columns), axis=1) - - train, valid, test = get_dataset_splits(df, config) - - # Length filter the data (all timeseries shorter than example len will be dropped) - #for df in [train, valid, test]: - # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) - train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) - valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) - test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) - - train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) - - cat_encodings = encode_categoricals(train, valid, test, config) - - os.makedirs(dst_path, exist_ok=True) - - train.to_csv(os.path.join(dst_path, 'train.csv')) - valid.to_csv(os.path.join(dst_path, 'valid.csv')) - test.to_csv(os.path.join(dst_path, 'test.csv')) - - # Save relevant columns in binary form for faster dataloading - # IMORTANT: We always expect id to be a single column indicating the complete timeseries - # We also expect a copy of id in form of static categorical input!!! 
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
- - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['categorical_id'] = label - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - output['categorical_id'] = output['id'].copy() - output['hours_from_start'] = output['t'] - output['categorical_day_of_week'] = output['day_of_week'].copy() - output['categorical_hour'] = output['hour'].copy() - - output.to_csv(os.path.join(path, 'standarized.csv')) - -def standarize_volatility(path): - df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index - - # Adds additional date/day fields - idx = [str(s).split('+')[0] for s in df.index - ] # ignore timezones, we don't need them - dates = pd.to_datetime(idx) - df['date'] = dates - df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days - df['day_of_week'] = dates.dayofweek - df['day_of_month'] = dates.day - df['week_of_year'] = dates.weekofyear - df['month'] = dates.month - df['year'] = dates.year - df['categorical_id'] = df['Symbol'].copy() - - # Processes log volatility - vol = df['rv5_ss'].copy() - vol.loc[vol == 0.] = np.nan - df['log_vol'] = np.log(vol) - - # Adds static information - symbol_region_mapping = { - '.AEX': 'EMEA', - '.AORD': 'APAC', - '.BFX': 'EMEA', - '.BSESN': 'APAC', - '.BVLG': 'EMEA', - '.BVSP': 'AMER', - '.DJI': 'AMER', - '.FCHI': 'EMEA', - '.FTMIB': 'EMEA', - '.FTSE': 'EMEA', - '.GDAXI': 'EMEA', - '.GSPTSE': 'AMER', - '.HSI': 'APAC', - '.IBEX': 'EMEA', - '.IXIC': 'AMER', - '.KS11': 'APAC', - '.KSE': 'APAC', - '.MXX': 'AMER', - '.N225': 'APAC ', - '.NSEI': 'APAC', - '.OMXC20': 'EMEA', - '.OMXHPI': 'EMEA', - '.OMXSPI': 'EMEA', - '.OSEAX': 'EMEA', - '.RUT': 'EMEA', - '.SMSI': 'EMEA', - '.SPX': 'AMER', - '.SSEC': 'APAC', - '.SSMI': 'EMEA', - '.STI': 'APAC', - '.STOXX50E': 'EMEA' - } - - df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) - - # Performs final processing - output_df_list = [] - for grp in df.groupby('Symbol'): - sliced = grp[1].copy() - sliced.sort_values('days_from_start', inplace=True) - # Impute log volatility values - sliced['log_vol'].fillna(method='ffill', inplace=True) - sliced.dropna() - output_df_list.append(sliced) - - df = pd.concat(output_df_list, axis=0) - - df.to_csv(os.path.join(path, 'standarized.csv')) - - -def standarize_traffic(path): - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(path, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(path, filename), 'r') as dat: - lines = dat.readlines() - for i, line in enumerate(lines): - if (i + 1) % 50 == 0: - print('Completed {} of {} rows for {}'.format(i + 1, len(lines), - filename)) - array = [ 
- process_list(row_split, variable_type=float, delimiter=None) - for row_split in process_list( - line, variable_type=str, delimiter=';') - ] - array_list.append(array) - - return array_list - - shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 - train_dayofweek = read_single_list('PEMS_trainlabels') - train_tensor = read_matrix('PEMS_train') - test_dayofweek = read_single_list('PEMS_testlabels') - test_tensor = read_matrix('PEMS_test') - - # Inverse permutate shuffle order - print('Shuffling') - inverse_mapping = { - new_location: previous_location - for previous_location, new_location in enumerate(shuffle_order) - } - reverse_shuffle_order = np.array([ - inverse_mapping[new_location] - for new_location, _ in enumerate(shuffle_order) - ]) - - # Group and reoder based on permuation matrix - print('Reodering') - day_of_week = np.array(train_dayofweek + test_dayofweek) - combined_tensor = np.array(train_tensor + test_tensor) - - day_of_week = day_of_week[reverse_shuffle_order] - combined_tensor = combined_tensor[reverse_shuffle_order] - - # Put everything back into a dataframe - print('Parsing as dataframe') - labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] - - hourly_list = [] - for day, day_matrix in enumerate(combined_tensor): - # Hourly data - hourly = pd.DataFrame(day_matrix.T, columns=labels) - hourly['hour_on_day'] = [int(i / 6) for i in hourly.index - ] # sampled at 10 min intervals - if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: - raise ValueError('Invalid hour! {}-{}'.format( - hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) - - hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] - hourly['sensor_day'] = day - hourly['time_on_day'] = hourly.index - hourly['day_of_week'] = day_of_week[day] - - hourly_list.append(hourly) - - hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) - - # Flatten such that each entitiy uses one row in dataframe - store_columns = [c for c in hourly_frame.columns if 'traj' in c] - other_columns = [c for c in hourly_frame.columns if 'traj' not in c] - flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + - other_columns + ['id']) - - for store in store_columns: - print('Processing {}'.format(store)) - - sliced = hourly_frame[[store] + other_columns].copy() - sliced.columns = ['values'] + other_columns - sliced['id'] = int(store.replace('traj_', '')) - - # Sort by Sensor-date-time - key = sliced['id'].apply(str) \ - + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ - + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) - sliced = sliced.set_index(key).sort_index() - - sliced['values'] = sliced['values'].fillna(method='ffill') - sliced['prev_values'] = sliced['values'].shift(1) - sliced['next_values'] = sliced['values'].shift(-1) - - flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) - - # Filter to match range used by other academic papers - index = flat_df['sensor_day'] - flat_df = flat_df[index < 173].copy() - - # Creating columns fo categorical inputs - flat_df['categorical_id'] = flat_df['id'].copy() - flat_df['hours_from_start'] = flat_df['time_on_day'] \ - + flat_df['sensor_day']*24. 
- flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() - flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() - - flat_df.to_csv(os.path.join(path, 'standarized.csv')) - - -# XXX needs rework -def standarize_favorita(data_folder): - import gc - # Extract only a subset of data to save/process for efficiency - start_date = pd.datetime(2015, 1, 1) - end_date = pd.datetime(2016, 6, 1) - - print('Regenerating data...') - - # load temporal data - temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) - - store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) - oil = pd.read_csv( - os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] - holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) - items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) - transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) - - # Take first 6 months of data - temporal['date'] = pd.to_datetime(temporal['date']) - - # Filter dates to reduce storage space requirements - if start_date is not None: - temporal = temporal[(temporal['date'] >= start_date)] - if end_date is not None: - temporal = temporal[(temporal['date'] < end_date)] - - dates = temporal['date'].unique() - - # Add trajectory identifier - temporal['traj_id'] = temporal['store_nbr'].apply( - str) + '_' + temporal['item_nbr'].apply(str) - temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( - str) - - # Remove all IDs with negative returns - print('Removing returns data') - min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() - valid_ids = set(min_returns[min_returns >= 0].index) - selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) - new_temporal = temporal[selector].copy() - del temporal - gc.collect() - temporal = new_temporal - temporal['open'] = 1 - - # Resampling - print('Resampling to regular grid') - resampled_dfs = [] - for traj_id, raw_sub_df in temporal.groupby('traj_id'): - print('Resampling', traj_id) - sub_df = raw_sub_df.set_index('date', drop=True).copy() - sub_df = sub_df.resample('1d').last() - sub_df['date'] = sub_df.index - sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ - = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') - sub_df['open'] = sub_df['open'].fillna( - 0) # flag where sales data is unknown - sub_df['log_sales'] = np.log(sub_df['unit_sales']) - - resampled_dfs.append(sub_df.reset_index(drop=True)) - - new_temporal = pd.concat(resampled_dfs, axis=0) - del temporal - gc.collect() - temporal = new_temporal - - print('Adding oil') - oil.name = 'oil' - oil.index = pd.to_datetime(oil.index) - #XXX the lines below match the value of the oil on given date with the rest of the timeseries - # missing values in oil series are copied from the index before. Then the oil series is joined with - # temporal. Then there are some dates present in temporal which arent present in oil, for which - # oil values is substituted with -1. WHY?! - #TODO: check how many nans there are after first step. Previously oil series was extended by dates - # present in dates variable with nan value, which were forward filled. - # This behavior is no longer supported by pandas, so we changed to DataFrame.isin method. - # This leaves us with more nans after first step than previously. To achieve previous behavior - # we have to join series before filling nans. 
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = 
flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets 
and predictions
-            _g = {'targets': g[:,:,0]}
-            _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)})
-
-            if args.joint_visualization:
-                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
-                for q, t in _g.items(): # target and quantiles, time-horizon values
-                    if q == 'targets':
-                        targets = torch.cat([t[:,0], t[-1,1:]]) # WIP
-                        # We want to plot targets on the same graph as predictions. This could probably be written better.
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons, which are shifted in phase.
-                    # The next lines realign them.
-                    y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y): # timestep, time-horizon values
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-
-    # Set up dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum': sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(quantiles['test_p10'], quantiles['test_p50'], quantiles['test_p90']))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on a separate plot. Projections will be concatenated.')
-    parser.add_argument('--save_predictions', action='store_true')
-    parser.add_argument('--results', type=str, default='/results')
-    parser.add_argument('--log_file', type=str, default='dllogger.json')
-    ARGS = parser.parse_args()
-    main(ARGS)
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py
deleted file mode 100644
index 83d2ac7f..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/log_helper.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import subprocess
-import sys
-import itertools
-import atexit
-
-import dllogger
-from dllogger import Backend, JSONStreamBackend, StdOutBackend
-
-import torch.distributed as dist
-from torch.utils.tensorboard import SummaryWriter
-
-class TensorBoardBackend(Backend):
-    def __init__(self, verbosity, log_dir):
-        super().__init__(verbosity=verbosity)
-        self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'),
-                                            flush_secs=120,
-                                            max_queue=200
-                                            )
-        self.hp_cache = None
-        atexit.register(self.summary_writer.close)
-
-    @property
-    def log_level(self):
-        return self._log_level
-
-    def metadata(self, timestamp, elapsedtime, metric, metadata):
-        pass
-
-    def log(self, timestamp, elapsedtime, step, data):
-        if step == 'HPARAMS':
-            parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
-            # Unpack lists and tuples
-            for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]:
-                parameters.update(d)
-            # Remove custom classes
-            parameters = {k: v for k, v in parameters.items() if isinstance(v, (int, float, str, bool))}
-            parameters.update({k:'None' for k, v in data.items() if v is None})
-            self.hp_cache = parameters
-        if step == ():
-            if self.hp_cache is None:
-                print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr)
-                return
-            self.summary_writer.add_hparams(self.hp_cache, data)
-        if not isinstance(step, int):
-            return
-        for k, v in data.items():
-            self.summary_writer.add_scalar(k, v, step)
-
-    def flush(self):
-        pass
-
-def setup_logger(args):
-    os.makedirs(args.results, exist_ok=True)
-    log_path = os.path.join(args.results, args.log_file)
-
-    if os.path.exists(log_path):
-        for i in itertools.count():
-            s_fname = args.log_file.split('.')
-            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
-    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. 
Temporal known a priori continuous
-        # 5. Temporal observed categorical
-        # 6. Temporal observed continuous
-        # 7. Temporal observed targets (time series observed so far)
-
-        self.s_cat_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None
-        self.t_cat_k_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None
-        self.t_cat_o_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None
-
-        self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size))
-
-        self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size))
-
-        if self.s_cont_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors)
-        if self.t_cont_k_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors)
-        if self.t_cont_o_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors)
-        torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors)
-
-    def _apply_embedding(self,
-                         cat: Optional[Tensor],
-                         cont: Optional[Tensor],
-                         cat_emb: Optional[nn.ModuleList],
-                         cont_emb: Tensor,
-                         cont_bias: Tensor,
-                         ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
-        e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None
-        if cont is not None:
-            # the line below is equivalent to the following einsums
-            #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb)
-            #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb)
-            e_cont = torch.mul(cont.unsqueeze(-1), cont_emb)
-            e_cont = e_cont + cont_bias
-        else:
-            e_cont = None
-
-        if e_cat is not None and e_cont is not None:
-            return torch.cat([e_cat, e_cont], dim=-2)
-        elif e_cat is not None:
-            return e_cat
-        elif e_cont is not None:
-            return e_cont
-        else:
-            return None
-
-    def forward(self, x: Dict[str, Tensor]):
-        # temporal/static categorical/continuous known/observed input
-        s_cat_inp = x.get('s_cat', None)
-        s_cont_inp = x.get('s_cont', None)
-        t_cat_k_inp = x.get('k_cat', None)
-        t_cont_k_inp = x.get('k_cont', None)
-        t_cat_o_inp = x.get('o_cat', None)
-        t_cont_o_inp = x.get('o_cont', None)
-        t_tgt_obs = x['target'] # Has to be present
-
-        # Static inputs are expected to be equal for all timesteps
-        # For memory efficiency there is no assert statement
-        s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None
-        s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None
-
-        s_inp = self._apply_embedding(s_cat_inp,
-                                      s_cont_inp,
-                                      self.s_cat_embed,
-                                      self.s_cont_embedding_vectors,
-                                      self.s_cont_embedding_bias)
-        t_known_inp = self._apply_embedding(t_cat_k_inp,
-                                            t_cont_k_inp,
-                                            self.t_cat_k_embed,
-                                            self.t_cont_k_embedding_vectors,
-                                            self.t_cont_k_embedding_bias)
-        t_observed_inp = self._apply_embedding(t_cat_o_inp,
-                                               t_cont_o_inp,
-                                               self.t_cat_o_embed,
-                                               self.t_cont_o_embedding_vectors,
-                                               self.t_cont_o_embedding_bias)
-
-        # Temporal observed targets
-        # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors)
-        t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3)
-        t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias
-
-        return s_inp, t_known_inp, t_observed_inp, t_observed_tgt
-
-class VariableSelectionNetwork(nn.Module):
-    def __init__(self, config, num_inputs):
-        super().__init__()
-        self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size)
-        self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)])
-
-    def forward(self, x: Tensor, context: Optional[Tensor] = None):
-        Xi = x.reshape(*x.shape[:-2], -1)
-        grn_outputs = self.joint_grn(Xi, c=context)
-        sparse_weights = F.softmax(grn_outputs, dim=-1)
-        transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)]
-        transformed_embed = torch.stack(transformed_embed_list, dim=-1)
-        # the line below performs batched matrix-vector multiplication
-        # for temporal features it's bthf,btf->bth
-        # for static features it's bhf,bf->bh
-        variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1)
-
-        return variable_ctx, sparse_weights
-
-class StaticCovariateEncoder(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.vsn = VariableSelectionNetwork(config, config.num_static_vars)
-        self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)])
-
-    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
-        variable_ctx, sparse_weights = self.vsn(x)
-
-        # Context vectors:
-        # variable selection context
-        # enrichment context
-        # state_h context
-        # state_c context
-        cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns)
-
-        return cs, ce, ch, cc
-
-
-class InterpretableMultiHeadAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.n_head = config.n_head
-        assert config.hidden_size % config.n_head == 0
-        self.d_head = config.hidden_size // config.n_head
-        self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False)
-        self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False)
-        self.attn_dropout = nn.Dropout(config.attn_dropout)
-        self.out_dropout = nn.Dropout(config.dropout)
-        self.scale = self.d_head**-0.5
-        self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0))
-
-    def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]:
-        bs, t, h_size = x.shape
-        qkv = self.qkv_linears(x)
-        q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1)
-        q = q.view(bs, t, self.n_head, self.d_head)
-        k = k.view(bs, t, self.n_head, self.d_head)
-        v = v.view(bs, t, self.d_head)
-
-        # attn_score = torch.einsum('bind,bjnd->bnij', q, k)
-        attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1)))
-        attn_score.mul_(self.scale)
-
-        if mask_future_timesteps:
-            attn_score = attn_score + self._mask
-
-        attn_prob = F.softmax(attn_score, dim=3)
-        attn_prob = self.attn_dropout(attn_prob)
-
-        # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v)
-        attn_vec = torch.matmul(attn_prob, v.unsqueeze(1))
-        m_attn_vec = torch.mean(attn_vec, dim=1)
-        out = self.out_proj(m_attn_vec)
-        out = self.out_dropout(out)
-
-        return out, attn_vec
-
-
-
-class TemporalFusionTransformer(nn.Module):
-    """
-    Implementation of https://arxiv.org/abs/1912.09363
-    """
-    def __init__(self, config):
-        super().__init__()
-
-        if hasattr(config, 'model'):
-            config = config.model
-
-        self.encoder_length = config.encoder_length # determines how far into the past the model looks
-
-        self.embedding = TFTEmbedding(config)
-        self.static_encoder = StaticCovariateEncoder(config)
-
-        self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars)
-        self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-        self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars)
-        self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-
-
-        self.input_gate = GLU(config.hidden_size, config.hidden_size)
-        self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.enrichment_grn = GRN(config.hidden_size,
-                                  config.hidden_size,
-                                  context_hidden_size=config.hidden_size,
-                                  dropout=config.dropout)
-        self.attention = InterpretableMultiHeadAttention(config)
-        self.attention_gate = GLU(config.hidden_size, config.hidden_size)
-        self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.positionwise_grn = GRN(config.hidden_size,
-                                    config.hidden_size,
-                                    dropout=config.dropout)
-
-        self.decoder_gate = GLU(config.hidden_size, config.hidden_size)
-        self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles))
-
-    def forward(self, x: Dict[str, Tensor]) -> Tensor:
-        s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x)
-
-        # Static context
-        cs, ce, ch, cc = self.static_encoder(s_inp)
-        ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # LSTM initial states
-
-        # Temporal input
-        _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]]
-        if t_observed_inp is not None:
-            _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:])
-
-        historical_inputs = torch.cat(_historical_inputs, dim=-2)
-        future_inputs = t_known_inp[:, self.encoder_length:]
-
-        # Encoders
-        historical_features, _ = self.history_vsn(historical_inputs, cs)
-        history, state = self.history_encoder(historical_features, (ch, cc))
-        future_features, _ = self.future_vsn(future_inputs, cs)
-        future, _ = self.future_encoder(future_features, state)
-        torch.cuda.synchronize() # this call gives a perf boost for unknown reasons
-
-        # skip connection
-        input_embedding = torch.cat([historical_features, future_features], dim=1)
-        temporal_features = torch.cat([history, future], dim=1)
-        temporal_features = self.input_gate(temporal_features)
-        temporal_features = temporal_features + input_embedding
-        temporal_features = self.input_gate_ln(temporal_features)
-
-        # Static enrichment
-        enriched = self.enrichment_grn(temporal_features, c=ce)
-
-        # Temporal self attention
-        x, _ = self.attention(enriched, mask_future_timesteps=True)
-
-        # Don't compute historical quantiles
-        x = x[:, self.encoder_length:, :]
-        temporal_features = 
temporal_features[:, self.encoder_length:, :] - enriched = enriched[:, self.encoder_length:, :] - - x = self.attention_gate(x) - x = x + enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT
deleted file mode 100644
index 5d1d88cf..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/LICENSE AGREEMENT
+++ /dev/null
@@ -1,25 +0,0 @@
-Individual Contributor License Agreement (CLA)
-Thank you for submitting your contributions to this project.
-
-By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project.
-
-License.
-You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement.
-
-This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore, you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions.
-
-Moral Rights.
-To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project.
-
-Third Party Content.
-If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project.
-
-Representations.
-You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions.
-
-Disclaimer.
-To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. 
You are not required to provide support for your Contributions, except to the extent you desire to provide support. - -No Obligation. -You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates. - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE deleted file mode 100644 index ae19bb47..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/NOTICE +++ /dev/null @@ -1,3 +0,0 @@ -TFT for PyTorch - -This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0 diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md deleted file mode 100644 index 69b39d12..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/README.md +++ /dev/null @@ -1,465 +0,0 @@ -# Temporal Fusion Transformer For PyTorch - -This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA. - -## Table Of Contents - -- [Model overview](#model-overview) - * [Model architecture](#model-architecture) - * [Default configuration](#default-configuration) - * [Feature support matrix](#feature-support-matrix) - * [Features](#features) - * [Mixed precision training](#mixed-precision-training) - * [Enabling mixed precision](#enabling-mixed-precision) - * [Enabling TF32](#enabling-tf32) - * [Glossary](#glossary) -- [Setup](#setup) - * [Requirements](#requirements) -- [Quick Start Guide](#quick-start-guide) -- [Advanced](#advanced) - * [Scripts and sample code](#scripts-and-sample-code) - * [Command-line options](#command-line-options) - * [Getting the data](#getting-the-data) - * [Dataset guidelines](#dataset-guidelines) - * [Multi-dataset](#multi-dataset) - * [Training process](#training-process) - * [Inference process](#inference-process) -- [Performance](#performance) - * [Benchmarking](#benchmarking) - * [Training performance benchmark](#training-performance-benchmark) - * [Inference performance benchmark](#inference-performance-benchmark) - * [Results](#results) - * [Training accuracy results](#training-accuracy-results) - * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) - * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) - * [Training stability test](#training-stability-test) - * [Training performance results](#training-performance-results) - * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) - * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) -- [Release notes](#release-notes) - * [Changelog](#changelog) - * [Known issues](#known-issues) - - - -## Model overview - -The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is 
a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
-This model enables the prediction of confidence intervals for future values of a time series over multiple future timesteps.
-
-This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
-### Model architecture
-
-The TFT model is a hybrid architecture joining LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the target time series. All variables are embedded in a high-dimensional space by learning an embedding vector for each of them. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. The model learns a single vector for each continuous variable, which is then scaled by this variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as an initial state of the LSTM encoders.
-After encoding, variables are passed to multi-head attention layers (the decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping some of its parts.
-For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
-![](TFT_architecture.PNG)
-*image source: https://arxiv.org/abs/1912.09363*
-
-### Default configuration
-
-The specific configuration of the TFT model depends on the dataset used. Not only is the size of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss QL(y, y_hat, q) = max(q * (y - y_hat), (q - 1) * (y - y_hat)), averaged over the quantiles q in [0.1, 0.5, 0.9] and over all forecast horizons; a minimal sketch of this loss is shown below. The default configurations are tuned for distributed training on DGX-1-32G with mixed precision. We use dynamic loss scaling.
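-The snippet below is a minimal sketch of this loss, included for illustration only. It assumes unscaled predictions shaped (batch, time, num_quantiles) and targets shaped (batch, time, 1), which is how the inference code in this repository lays them out; the repository's actual `QuantileLoss` class may differ in details.
-
-```python
-import torch
-
-def quantile_loss(predictions, targets, quantiles=(0.1, 0.5, 0.9)):
-    # predictions: (batch, time, num_quantiles); targets: (batch, time, 1)
-    diff = targets - predictions                   # y - y_hat, broadcast over quantiles
-    q = predictions.new_tensor(quantiles)          # (num_quantiles,)
-    pinball = torch.max(q * diff, (q - 1) * diff)  # elementwise pinball (quantile) loss
-    return pinball.mean(dim=(0, 1))                # one loss value per quantile
-
-# q-risk, as reported by the inference script above:
-# q_risk = 2 * quantile_loss(predictions, targets) / targets.abs().mean()
-```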
Specific values are provided in the table below. - -| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | -| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 | - -### Feature support matrix - -The following features are supported by this model: - -| Feature | Supported -|----------------------------|-------------------------- -|Distributed data parallel | Yes -|PyTorch AMP | Yes - - -#### Features - -[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html) -provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information. - -[PyTorch -DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module -wrapper that enables easy multiprocess distributed data-parallel -training. - -### Mixed precision training - -Mixed precision is the combined use of different numerical precisions in a -computational method. -[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant -computational speedup by performing operations in half-precision format while -storing minimal information in single-precision to retain as much information -as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with -both the Turing and Ampere architectures, significant training speedups are -experienced by switching to -mixed precision -- up to 3x overall speedup on the most arithmetically intense -model architectures. Using mixed precision training previously required two -steps: - -1. Porting the model to use the FP16 data type where appropriate. -2. Manually adding loss scaling to preserve small gradient values. - -The ability to train deep learning networks with lower precision was introduced -in the Pascal architecture and first supported in [CUDA -8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep -Learning SDK. - -For information about: -* How to train using mixed precision, refer to the [Mixed Precision - Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed - Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) - documentation. -* Techniques used for mixed precision training, refer to the [Mixed-Precision - Training of Deep Neural - Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) - blog. -* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in - PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) - blog. - - -#### Enabling mixed precision - - -Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision torch.cuda.amp module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
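Concretely, a minimal, hypothetical training step that wires both pieces together with `torch.cuda.amp` (the training script exposes this mechanism through the `--use_amp` flag; the model, optimizer, and batch layout below are placeholders) might look like:

```python
import torch
from torch.cuda import amp

scaler = amp.GradScaler()  # handles dynamic loss scaling

def training_step(model, optimizer, criterion, batch, targets):
    optimizer.zero_grad()
    with amp.autocast():               # forward pass runs in mixed precision
        loss = criterion(model(batch), targets).sum()
    scaler.scale(loss).backward()      # scale the loss to preserve small gradients
    scaler.step(optimizer)             # unscales gradients, then optimizer.step()
    scaler.update()                    # adapt the loss scale for the next iteration
    return loss.detach()
```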
In PyTorch, loss scaling can be applied automatically by the GradScaler class. All the necessary steps to implement AMP are verbosely described [here](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples). - -To enable mixed precision for TFT, simply add the `--use_amp` option to the training script. -#### Enabling TF32 - -TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. - -TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models that require a high dynamic range for weights or activations. - -For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. - -TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. - - - -### Glossary - -**Multi-horizon prediction** -The process of estimating values of a time series for multiple future time steps. - -**Quantiles** -Cut points dividing the range of a probability distribution into intervals with equal probabilities. - -**Time series** -A series of data points indexed and equally spaced in time. - -**Transformer** -The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called the Transformer, which uses an attention mechanism to transform one sequence into another. - - -## Setup - -The following section lists the requirements that you need to meet in order to start training the TFT model. - -### Requirements - -This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: -- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) -- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -- Supported GPUs: -- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) -- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/) -- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) - -For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: -- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) - [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) -- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running) - - -For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). - -## Quick Start Guide - -To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets.
For the specifics concerning training and inference, refer to the [Advanced](#advanced) section. - -1. Clone the repository. -```bash -git clone https://github.com/NVIDIA/DeepLearningExamples -cd DeepLearningExamples/PyTorch/Forecasting/TFT -``` - -2. Build the TFT PyTorch NGC container. -```bash -docker build --network=host -t tft . -``` - -3. Start an interactive session in the NGC container to run training/inference. -```bash -docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft -``` - -Note: Be sure to mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container. - -4. Download and preprocess the datasets. -```bash -bash scripts/get_data.sh -``` - -5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory. -These scripts are tuned for DGX-1-32G. If you have a different system, use the NGPU and BATCH_SIZE variables to adjust the parameters for your system. -```bash -bash scripts/run_electricity.sh -bash scripts/run_traffic.sh -``` - -6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy. -```bash -python inference.py \ ---checkpoint <path to checkpoint> \ ---data /data/processed/<dataset>/test.csv \ ---cat_encodings /data/processed/<dataset>/cat_encodings.bin \ ---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin -``` - -7. Start inference/predictions. Visualize and save predictions by running the following command. -```bash -python inference.py \ ---checkpoint <path to checkpoint> \ ---data /data/processed/<dataset>/test.csv \ ---cat_encodings /data/processed/<dataset>/cat_encodings.bin \ ---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \ ---visualize \ ---save_predictions -``` - - - -Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance results](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section. -## Advanced - -The following sections provide more details about the dataset, running training and inference, and the training results. - -### Scripts and sample code - -In the root directory, the most important files are: - -- `train.py`: entry point for training -- `data_utils.py`: contains the dataset implementation and preprocessing functions -- `modeling.py`: definition of the model -- `configuration.py`: contains configuration classes for various experiments -- `test.py`: entry point for testing a trained model -- `Dockerfile`: container definition -- `log_helper.py`: contains helper functions for setting up dllogger -- `criterions.py`: definitions of loss functions - -The `scripts` directory contains scripts for the default use cases: -- `run_electricity.sh`: trains the default model on the electricity dataset -- `run_traffic.sh`: trains the default model on the traffic dataset - -### Command-line options - -To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example: -`python train.py --help`.
- -The following example output is printed when running the model: -``` -usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD] - [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG] - -optional arguments: - -h, --help show this help message and exit - --data_path DATA_PATH - --dataset {electricity,volatility,traffic,favorita} - --epochs EPOCHS - --sample_data SAMPLE_DATA SAMPLE_DATA - --batch_size BATCH_SIZE - --lr LR - --seed SEED - --use_amp Enable automatic mixed precision - --clip_grad CLIP_GRAD - --early_stopping EARLY_STOPPING - Stop training if validation loss does not improve for more than this number of epochs. - --results RESULTS - --log_file LOG_FILE - --distributed_world_size N - total number of GPUs across all nodes (default: all visible GPUs) - --distributed_rank DISTRIBUTED_RANK - rank of the current worker - --local_rank LOCAL_RANK - rank of the current worker - --overwrite_config OVERWRITE_CONFIG - JSON string used to overload config - -``` - -### Getting the data - -The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, automatically downloads and preprocesses the training, validation, and test datasets, and produces files that contain the scalers. -#### Dataset guidelines - -The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into a common format that contains the features we will use for training. The standardized data is then cleaned, normalized, encoded, and binarized. -This step does the following: -- Drop all the columns that are not marked in the configuration file as used for training or preprocessing -- Flatten indices in case time series are indexed by more than one column -- Split the data into training, validation, and test splits -- Filter out all the time series shorter than the minimal example length -- Normalize columns marked as continuous in the configuration file -- Encode as integers the columns marked as categorical -- Save the data in csv and binary formats - -#### Multi-dataset -In order to use an alternate dataset, you have to write a function that parses your data into a common format. The format is as follows: -- There is at least one id column -- There is exactly one time column (which can also be used as a feature column) -- Each feature is in a separate column -- Each row represents a moment in time for only one time series -Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file; a minimal sketch is also given below, in the Training process section. -### Training process - -The `train.py` script is an entry point for the training procedure. Refined recipes can be found in the `scripts` directory. -The model trains for at most `--epochs` epochs. If the `--early_stopping N` option is set, training ends early once the validation loss has not improved for N consecutive epochs. -The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file.
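As a sketch of what such a configuration might look like (all feature names and split boundaries here are hypothetical; the real `ElectricityConfig` and `TrafficConfig`, including the derived size fields omitted here, appear in `configuration.py` later in this diff):

```python
from data_utils import InputTypes, DataTypes, FeatureSpec

class MyDatasetConfig():
    def __init__(self):
        # Every column of the common format is declared with its role and type.
        self.features = [
            FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL),
            FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS),
            FeatureSpec('target_value', InputTypes.TARGET, DataTypes.CONTINUOUS),
            FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS),
            FeatureSpec('category', InputTypes.STATIC, DataTypes.CATEGORICAL),
        ]
        self.time_ids = 'days_from_start'  # column defining the split boundaries
        self.train_range = (0, 100)        # hypothetical split ranges
        self.valid_range = (93, 110)
        self.test_range = (103, 120)
        self.dataset_stride = 1            # timesteps between consecutive examples
        self.scale_per_id = False
        self.missing_id_strategy = None
        self.missing_cat_data_strategy = 'encode_all'
        self.quantiles = [0.1, 0.5, 0.9]
        self.example_length = 8 * 24       # encoder length + forecast horizon
        self.encoder_length = 7 * 24
```

A new configuration is made visible to `train.py` by registering it in the `CONFIGS` dictionary at the bottom of `configuration.py`.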
You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`. - -Example command: -``` -python -m torch.distributed.launch --nproc_per_node=8 train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=1024 \ - --sample 450000 50000 \ - --lr 1e-3 \ - --epochs 25 \ - --early_stopping 5 \ - --seed 1 \ - --use_amp \ - --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1 -``` - -The model is trained by optimizing the quantile loss described in the [Default configuration](#default-configuration) section. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric, that is, the quantile loss normalized by the magnitude of the true values: q-risk(q) = 2 * sum(QL(y, y_hat, q)) / sum(|y|). -Results are stored by default in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in dictionary-per-line format), and TensorBoard logs. - -### Inference process - -Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running the inference. Example command: -``` -python inference.py \ ---checkpoint /results/checkpoint.pt \ ---data /data/processed/electricity_bin/test.csv \ ---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \ ---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \ ---batch_size 2048 \ ---visualize \ ---save_predictions \ ---joint_visualization \ ---results /results \ ---use_amp -``` - -In the default setting, the script evaluates the model on the specified dataset and prints the q-risk for this dataset. In order to save the predictions, use the `--save_predictions` option; predictions will be stored in csv format in the directory specified by the `--results` option. The `--joint_visualization` option plots graphs in TensorBoard format, which lets us inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file. -## Performance - -### Benchmarking - -The following section shows how to run benchmarks measuring the model performance in training and inference modes. - -#### Training performance benchmark - -In order to run training benchmarks, use the `scripts/benchmark.sh` script. - -#### Inference performance benchmark - -To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script. -### Results - -The following sections provide details on how we achieved our performance and accuracy in training and inference. - -#### Training accuracy results - -We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs. - -##### Training accuracy: NVIDIA DGX A100 (A100 80GB) - -Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs.
- -| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) -|-------------|---|------|-----------------------|-----------------------|-------|-------|------- -| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x -| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x -| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x -| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x - - - - -##### Training accuracy: NVIDIA DGX-1 (V100 16GB) - -Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. - -| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) -|-------------|---|------|-----------------------|-----------------------|-------|-------|----------- -| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x -| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x -| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x -| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x - - - -##### Training stability test - -In order to get a better picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations. - -| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk -|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------ -| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200 -| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336 - - -#### Training performance results - -##### Training performance: NVIDIA DGX A100 (A100 80GB) - -Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch. - -| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision -|-------------|---|------|--------|--------|-------|-------|----- -| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1 -| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x -| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1 -| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x - - -To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). - -The performance metrics used were items per second.
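For reference, the throughput figures above and below are epoch averages; a hypothetical sketch of such a measurement (the helper and its arguments are placeholders, not the benchmark script's actual API):

```python
import time
import torch

def epoch_throughput(dataloader, training_step, batch_size):
    """Average items/s over one full epoch (hypothetical helper)."""
    n_batches = 0
    start = time.perf_counter()
    for batch in dataloader:
        training_step(batch)       # forward + backward + optimizer step
        n_batches += 1
    torch.cuda.synchronize()       # make sure queued GPU work is counted
    elapsed = time.perf_counter() - start
    return n_batches * batch_size / elapsed
```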
- - -##### Training performance: NVIDIA DGX-1 (V100 16GB) - -Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch. - -| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision -|-------------|---|------|-------|-------|-------|------|---- -| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1 -| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x -| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1 -| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x - - - -To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). - -The performance metrics used were items per second. - -## Release notes -The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved with NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference. - -### Changelog - -October 2021 -- Initial release - -### Known issues -There are no known issues with this model. - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG deleted file mode 100644 index c3431031..00000000 Binary files a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/TFT_architecture.PNG and /dev/null differ diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py deleted file mode 100644 index bef26e66..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/configuration.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
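-# This module bundles the per-dataset hyperparameters and feature schemas; -# train.py selects one of these classes via the --dataset flag (see the CONFIGS -# dictionary at the bottom of this file).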
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. - # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == 
InputTypes.TARGET: - tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.ID: - tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) - - - tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - -def get_dataset_splits(df, config): - - if hasattr(config, 'relative_split') and config.relative_split: - forecast_len = config.example_length - config.encoder_length - # The valid split is shifted from the train split by the number of forecast steps into the future. - # The test split is shifted by the number of forecast steps from the valid split - train = [] - valid = [] - test = [] - - for _, group in df.groupby(DEFAULT_ID_COL): - index = group[config.time_ids] - _train = group.loc[index < config.valid_boundary] - _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] - _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] - train.append(_train) - valid.append(_valid) - test.append(_test) - - train = pd.concat(train, axis=0) - valid = pd.concat(valid, axis=0) - test = pd.concat(test, axis=0) - else: - index = df[config.time_ids] - train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] - valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] - test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] - - return train, valid, test - -def flatten_ids(df, config): - - if config.missing_id_strategy == 'drop': - if hasattr(config, 'combine_ids') and config.combine_ids: - index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) - else: - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - index = df[id_col].isna() - index = index[index == True].index # Extract indices of NaNs - df.drop(index, inplace=True) - - if not (hasattr(config, 'combine_ids') and config.combine_ids): - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - ids = df[id_col].apply(str) - df.drop(id_col, axis=1, inplace=True) - encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) - df[DEFAULT_ID_COL] = encoder.transform(ids) - encoders = OrderedDict({id_col: encoder}) - - else: - encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} - encoders = OrderedDict(encoders) - lens = [len(v.classes_) for v in encoders.values()] - clens = np.roll(np.cumprod(lens), 1) - clens[0] = 1 - - # This takes a very long time; it would probably be better to create 2 dummy columns - df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) - df.drop(config.combine_ids, axis=1, inplace=True) - - return DEFAULT_ID_COL, encoders - -def impute(df, config): - #XXX This ensures that our scaling will have the same mean.
We still need to check the variance - if not hasattr(config, 'missing_data_label'): - return df, None - else: - imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) - data = df.values - col_mask = (data == config.missing_data_label).all(axis=0) - data[:,~col_mask] = imp.fit_transform(data) - return data, mask - -def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): - tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] - real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) - real_scalers = {} - tgt_scalers = {} - - def apply_scalers(df, name=None): - if name is None: - name = df.name - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None - df[real_cols] = real_scalers[name].transform(df[real_cols]) - if mask is not None and any(mask): - # Replace masked (missing) entries with a large sentinel value - df[real_cols] = df[real_cols].mask(mask, 10**9) - df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) - return df - - if config.scale_per_id: - for identifier, sliced in train.groupby(id_col): - data = sliced[real_cols] - data, _ = impute(data, config) - real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) - # XXX We should probably remove examples that contain NaN as a target - target = sliced[tgt_cols] - tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) - - train = train.groupby(id_col).apply(apply_scalers) - # For the valid and test splits, leave only time series previously present in the train subset - # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away - valid = valid.loc[valid[id_col].isin(real_scalers.keys())] - valid = valid.groupby(id_col).apply(apply_scalers) - test = test.loc[test[id_col].isin(real_scalers.keys())] - test = test.groupby(id_col).apply(apply_scalers) - - else: - data, _ = impute(train[real_cols], config) - real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) - tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) - - train = apply_scalers(train, name='') - valid = apply_scalers(valid, name='') - test = apply_scalers(test, name='') - - return train, valid, test, real_scalers, tgt_scalers - -def encode_categoricals(train, valid, test, config): - cat_encodings = {} - cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) - num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warning? - # For TC performance reasons we might want num_classes[i] to be divisible by 8 - - # Train categorical encoders - for c in cat_cols: - if config.missing_cat_data_strategy == 'special_token': - #XXX this will probably require some data augmentation - unique = train[c].unique() - # Map categories unseen during training to the special token - valid[c].loc[~valid[c].isin(unique)] = '' - test[c].loc[~test[c].isin(unique)] = '' - - if config.missing_cat_data_strategy == 'encode_all' or \ - config.missing_cat_data_strategy == 'special_token': - srs = pd.concat([train[c], valid[c], test[c]]).apply(str) - cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) - elif config.missing_cat_data_strategy == 'drop': - # TODO: implement this.
In addition to dropping rows, this has to split the affected time series into chunks - # to prevent the data from having temporal gaps - pass - num_classes.append(srs.nunique()) - print('Categorical variables encodings lens: ', num_classes) - - - for split in [train, valid, test]: - for c in cat_cols: - srs = split[c].apply(str) - split[c] = srs - split.loc[:,c] = cat_encodings[c].transform(srs) - - return cat_encodings - - -def preprocess(src_path, dst_path, config): - df = pd.read_csv(src_path, index_col=0) - - for c in config.features: - if c.feature_embed_type == DataTypes.DATE: - df[c.name] = pd.to_datetime(df[c.name]) - - # Leave only columns relevant to preprocessing - relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) - df = df[relevant_columns] - - - id_col, id_encoders = flatten_ids(df, config) - df = df.reindex(sorted(df.columns), axis=1) - - train, valid, test = get_dataset_splits(df, config) - - # Length filter the data (all time series shorter than the example length will be dropped) - #for df in [train, valid, test]: - # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) - train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) - valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) - test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) - - train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) - - cat_encodings = encode_categoricals(train, valid, test, config) - - os.makedirs(dst_path, exist_ok=True) - - train.to_csv(os.path.join(dst_path, 'train.csv')) - valid.to_csv(os.path.join(dst_path, 'valid.csv')) - test.to_csv(os.path.join(dst_path, 'test.csv')) - - # Save relevant columns in binary form for faster dataloading - # IMPORTANT: We always expect id to be a single column indicating the complete timeseries - # We also expect a copy of id in the form of a static categorical input!!!
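- # The mixed continuous/categorical matrix below is first cast to float32 and then - # reinterpreted (bit-cast) as int32 via numpy's view; TFTBinaryDataset reverses - # the view and casts each column group back to its final dtype.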
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
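- # Assemble a per-meter hourly frame; the calendar columns built below (hour, - # day_of_week, hours_from_start) are later referenced by ElectricityConfig.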
- - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['categorical_id'] = label - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - output['categorical_id'] = output['id'].copy() - output['hours_from_start'] = output['t'] - output['categorical_day_of_week'] = output['day_of_week'].copy() - output['categorical_hour'] = output['hour'].copy() - - output.to_csv(os.path.join(path, 'standarized.csv')) - -def standarize_volatility(path): - df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index - - # Adds additional date/day fields - idx = [str(s).split('+')[0] for s in df.index - ] # ignore timezones, we don't need them - dates = pd.to_datetime(idx) - df['date'] = dates - df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days - df['day_of_week'] = dates.dayofweek - df['day_of_month'] = dates.day - df['week_of_year'] = dates.weekofyear - df['month'] = dates.month - df['year'] = dates.year - df['categorical_id'] = df['Symbol'].copy() - - # Processes log volatility - vol = df['rv5_ss'].copy() - vol.loc[vol == 0.] = np.nan - df['log_vol'] = np.log(vol) - - # Adds static information - symbol_region_mapping = { - '.AEX': 'EMEA', - '.AORD': 'APAC', - '.BFX': 'EMEA', - '.BSESN': 'APAC', - '.BVLG': 'EMEA', - '.BVSP': 'AMER', - '.DJI': 'AMER', - '.FCHI': 'EMEA', - '.FTMIB': 'EMEA', - '.FTSE': 'EMEA', - '.GDAXI': 'EMEA', - '.GSPTSE': 'AMER', - '.HSI': 'APAC', - '.IBEX': 'EMEA', - '.IXIC': 'AMER', - '.KS11': 'APAC', - '.KSE': 'APAC', - '.MXX': 'AMER', - '.N225': 'APAC ', - '.NSEI': 'APAC', - '.OMXC20': 'EMEA', - '.OMXHPI': 'EMEA', - '.OMXSPI': 'EMEA', - '.OSEAX': 'EMEA', - '.RUT': 'EMEA', - '.SMSI': 'EMEA', - '.SPX': 'AMER', - '.SSEC': 'APAC', - '.SSMI': 'EMEA', - '.STI': 'APAC', - '.STOXX50E': 'EMEA' - } - - df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) - - # Performs final processing - output_df_list = [] - for grp in df.groupby('Symbol'): - sliced = grp[1].copy() - sliced.sort_values('days_from_start', inplace=True) - # Impute log volatility values - sliced['log_vol'].fillna(method='ffill', inplace=True) - sliced.dropna() - output_df_list.append(sliced) - - df = pd.concat(output_df_list, axis=0) - - df.to_csv(os.path.join(path, 'standarized.csv')) - - -def standarize_traffic(path): - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(path, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(path, filename), 'r') as dat: - lines = dat.readlines() - for i, line in enumerate(lines): - if (i + 1) % 50 == 0: - print('Completed {} of {} rows for {}'.format(i + 1, len(lines), - filename)) - array = [ 
- process_list(row_split, variable_type=float, delimiter=None) - for row_split in process_list( - line, variable_type=str, delimiter=';') - ] - array_list.append(array) - - return array_list - - shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 - train_dayofweek = read_single_list('PEMS_trainlabels') - train_tensor = read_matrix('PEMS_train') - test_dayofweek = read_single_list('PEMS_testlabels') - test_tensor = read_matrix('PEMS_test') - - # Invert the shuffle-order permutation - print('Shuffling') - inverse_mapping = { - new_location: previous_location - for previous_location, new_location in enumerate(shuffle_order) - } - reverse_shuffle_order = np.array([ - inverse_mapping[new_location] - for new_location, _ in enumerate(shuffle_order) - ]) - - # Group and reorder based on the permutation - print('Reordering') - day_of_week = np.array(train_dayofweek + test_dayofweek) - combined_tensor = np.array(train_tensor + test_tensor) - - day_of_week = day_of_week[reverse_shuffle_order] - combined_tensor = combined_tensor[reverse_shuffle_order] - - # Put everything back into a dataframe - print('Parsing as dataframe') - labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] - - hourly_list = [] - for day, day_matrix in enumerate(combined_tensor): - # Hourly data - hourly = pd.DataFrame(day_matrix.T, columns=labels) - hourly['hour_on_day'] = [int(i / 6) for i in hourly.index - ] # sampled at 10 min intervals - if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: - raise ValueError('Invalid hour! {}-{}'.format( - hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) - - hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] - hourly['sensor_day'] = day - hourly['time_on_day'] = hourly.index - hourly['day_of_week'] = day_of_week[day] - - hourly_list.append(hourly) - - hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) - - # Flatten such that each entity uses one row in the dataframe - store_columns = [c for c in hourly_frame.columns if 'traj' in c] - other_columns = [c for c in hourly_frame.columns if 'traj' not in c] - flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + - other_columns + ['id']) - - for store in store_columns: - print('Processing {}'.format(store)) - - sliced = hourly_frame[[store] + other_columns].copy() - sliced.columns = ['values'] + other_columns - sliced['id'] = int(store.replace('traj_', '')) - - # Sort by sensor-date-time - key = sliced['id'].apply(str) \ - + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ - + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) - sliced = sliced.set_index(key).sort_index() - - sliced['values'] = sliced['values'].fillna(method='ffill') - sliced['prev_values'] = sliced['values'].shift(1) - sliced['next_values'] = sliced['values'].shift(-1) - - flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) - - # Filter to match the range used by other academic papers - index = flat_df['sensor_day'] - flat_df = flat_df[index < 173].copy() - - # Create columns for categorical inputs - flat_df['categorical_id'] = flat_df['id'].copy() - flat_df['hours_from_start'] = flat_df['time_on_day'] \ - + flat_df['sensor_day']*24.
- flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() - flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() - - flat_df.to_csv(os.path.join(path, 'standarized.csv')) - - -# XXX needs rework -def standarize_favorita(data_folder): - import gc - # Extract only a subset of data to save/process for efficiency - start_date = pd.datetime(2015, 1, 1) - end_date = pd.datetime(2016, 6, 1) - - print('Regenerating data...') - - # load temporal data - temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) - - store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) - oil = pd.read_csv( - os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] - holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) - items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) - transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) - - # Keep only the slice between start_date and end_date - temporal['date'] = pd.to_datetime(temporal['date']) - - # Filter dates to reduce storage space requirements - if start_date is not None: - temporal = temporal[(temporal['date'] >= start_date)] - if end_date is not None: - temporal = temporal[(temporal['date'] < end_date)] - - dates = temporal['date'].unique() - - # Add trajectory identifier - temporal['traj_id'] = temporal['store_nbr'].apply( - str) + '_' + temporal['item_nbr'].apply(str) - temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( - str) - - # Remove all IDs with negative returns - print('Removing returns data') - min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() - valid_ids = set(min_returns[min_returns >= 0].index) - selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) - new_temporal = temporal[selector].copy() - del temporal - gc.collect() - temporal = new_temporal - temporal['open'] = 1 - - # Resampling - print('Resampling to regular grid') - resampled_dfs = [] - for traj_id, raw_sub_df in temporal.groupby('traj_id'): - print('Resampling', traj_id) - sub_df = raw_sub_df.set_index('date', drop=True).copy() - sub_df = sub_df.resample('1d').last() - sub_df['date'] = sub_df.index - sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ - = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') - sub_df['open'] = sub_df['open'].fillna( - 0) # flag where sales data is unknown - sub_df['log_sales'] = np.log(sub_df['unit_sales']) - - resampled_dfs.append(sub_df.reset_index(drop=True)) - - new_temporal = pd.concat(resampled_dfs, axis=0) - del temporal - gc.collect() - temporal = new_temporal - - print('Adding oil') - oil.name = 'oil' - oil.index = pd.to_datetime(oil.index) - #XXX the lines below match the value of the oil on a given date with the rest of the timeseries - # missing values in the oil series are copied from the index before. Then the oil series is joined with - # temporal. Then there are some dates present in temporal which aren't present in oil, for which - # the oil value is substituted with -1. WHY?! - #TODO: check how many NaNs there are after the first step. Previously the oil series was extended by dates - # present in the dates variable with a NaN value, which were forward filled. - # This behavior is no longer supported by pandas, so we changed to the DataFrame.isin method. - # This leaves us with more NaNs after the first step than previously. To achieve the previous behavior - # we have to join the series before filling NaNs.
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
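ModelEma above keeps a frozen copy of the model and, on each update(), moves every stored tensor toward the live weights: ema_v <- decay * ema_v + (1 - decay) * model_v. A minimal usage sketch under assumed names (criterion, optimizer, train_loader, and val_batch are stand-ins; train.py enables this path via --ema_decay):

    model = TemporalFusionTransformer(config).cuda()
    model_ema = ModelEma(model, decay=0.999)

    for batch in train_loader:
        loss = criterion(model(batch), batch['target'])  # simplified loss call
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        model_ema.update(model)  # ema_v <- 0.999 * ema_v + 0.001 * model_v

    predictions = model_ema(val_batch)  # forward() dispatches to the averaged copy

Because update() zips the two state_dicts in order rather than matching by name, the EMA copy must have exactly the same module structure as the live model.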
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
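The affinity helpers above are consumed by train.py, which calls gpu_affinity.set_affinity(args.local_rank, nproc_per_node, args.affinity) once per process before training starts. A standalone sketch of the same call (the node size of 8 is an assumption):

    import os
    import gpu_affinity

    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    nproc_per_node = 8  # assumed: one process per GPU on an 8-GPU node
    affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node,
                                         mode='socket_unique_interleaved')
    print(f'rank {local_rank}: bound to CPU cores {sorted(affinity)}')

In the 'socket_unique_interleaved' mode, each process sharing a socket receives a disjoint, interleaved slice of that socket's physical cores, with hyperthread siblings re-added afterwards.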
- -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, 
model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - _g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)}) - - if args.joint_visualization: - summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key))) - for q, t in _g.items(): # target and quantiles, timehorizon values - if q == 'targets': - targets = torch.cat([t[:,0], t[-1,1:]]) # WIP - # We want to plot targets on the same graph as predictions. Probably could be written better. 
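-                        # Each row of t is one sliding window of length num_horizons:
-                        # t[:,0] takes the first horizon of every window and t[-1,1:]
-                        # appends the trailing horizons of the last window, rebuilding
-                        # a single contiguous target series.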
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons which are shifted in phase
-                    # Next lines realign them
-                    y = t.new_full((t.shape[0] + t.shape[1] -1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y): # timestep, time horizon values
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-    model.cuda()
-
-    # Set up dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    print('Test q-risk: P10 {test_p10} | P50 {test_p50} | P90 {test_p90}'.format(**quantiles))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on a separate plot.
Projections will be concatenated.') - parser.add_argument('--save_predictions', action='store_true') - parser.add_argument('--results', type=str, default='/results') - parser.add_argument('--log_file', type=str, default='dllogger.json') - ARGS = parser.parse_args() - main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py deleted file mode 100644 index 83d2ac7f..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/log_helper.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import subprocess -import sys -import itertools -import atexit - -import dllogger -from dllogger import Backend, JSONStreamBackend, StdOutBackend - -import torch.distributed as dist -from torch.utils.tensorboard import SummaryWriter - -class TensorBoardBackend(Backend): - def __init__(self, verbosity, log_dir): - super().__init__(verbosity=verbosity) - self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'), - flush_secs=120, - max_queue=200 - ) - self.hp_cache = None - atexit.register(self.summary_writer.close) - - @property - def log_level(self): - return self._log_level - - def metadata(self, timestamp, elapsedtime, metric, metadata): - pass - - def log(self, timestamp, elapsedtime, step, data): - if step == 'HPARAMS': - parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))} - #Unpack list and tuples - for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]: - parameters.update(d) - #Remove custom classes - parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))} - parameters.update({k:'None' for k, v in data.items() if v is None}) - self.hp_cache = parameters - if step == (): - if self.hp_cache is None: - print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr) - return - self.summary_writer.add_hparams(self.hp_cache, data) - if not isinstance(step, int): - return - for k, v in data.items(): - self.summary_writer.add_scalar(k, v, step) - - def flush(self): - pass - -def setup_logger(args): - os.makedirs(args.results, exist_ok=True) - log_path = os.path.join(args.results, args.log_file) - - if os.path.exists(log_path): - for i in itertools.count(): - s_fname = args.log_file.split('.') - fname = '.'.join(s_fname[:-1]) + f'_{i}.' 
+ s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
-    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. 
Temporal known a priori continuous
-        # 5. Temporal observed categorical
-        # 6. Temporal observed continuous
-        # 7. Temporal observed targets (time series observed so far)
-
-        self.s_cat_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None
-        self.t_cat_k_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None
-        self.t_cat_o_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None
-
-        self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size))
-
-        self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size))
-
-        if self.s_cont_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors)
-        if self.t_cont_k_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors)
-        if self.t_cont_o_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors)
-        torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors)
-
-    def _apply_embedding(self,
-                         cat: Optional[Tensor],
-                         cont: Optional[Tensor],
-                         cat_emb: Optional[nn.ModuleList],
-                         cont_emb: Tensor,
-                         cont_bias: Tensor,
-                         ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
-        e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None
-        if cont is not None:
-            #the line below is equivalent to following einsums
-            #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb)
-            #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb)
-            e_cont = torch.mul(cont.unsqueeze(-1), cont_emb)
-            e_cont = e_cont + cont_bias
-        else:
-            e_cont = None
-
-        if e_cat is not None and e_cont is not None:
-            return torch.cat([e_cat, e_cont], dim=-2)
-        elif e_cat is not None:
-            return e_cat
-        elif e_cont is not None:
-            return e_cont
-        else:
-            return None
-
-    def forward(self, x: Dict[str, Tensor]):
-        # temporal/static categorical/continuous known/observed input
-        s_cat_inp = x.get('s_cat', None)
-        s_cont_inp = x.get('s_cont', None)
-        t_cat_k_inp = x.get('k_cat', None)
-        t_cont_k_inp = x.get('k_cont', None)
-        t_cat_o_inp = x.get('o_cat', None)
-        t_cont_o_inp = x.get('o_cont', None)
-        t_tgt_obs = x['target']  # Has to be present
-
-        # Static inputs are expected to be equal for all timesteps
-        # For memory efficiency there is no assert statement
-        s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None
-        s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None
-
-        s_inp = self._apply_embedding(s_cat_inp,
-                                      s_cont_inp,
-                                      self.s_cat_embed,
-
self.s_cont_embedding_vectors, - self.s_cont_embedding_bias) - t_known_inp = self._apply_embedding(t_cat_k_inp, - t_cont_k_inp, - self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - #the line below performs batched matrix vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_c context - # state_h context - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 
1)))
-        attn_score.mul_(self.scale)
-
-        if mask_future_timesteps:
-            attn_score = attn_score + self._mask
-
-        attn_prob = F.softmax(attn_score, dim=3)
-        attn_prob = self.attn_dropout(attn_prob)
-
-        # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v)
-        attn_vec = torch.matmul(attn_prob, v.unsqueeze(1))
-        m_attn_vec = torch.mean(attn_vec, dim=1)
-        out = self.out_proj(m_attn_vec)
-        out = self.out_dropout(out)
-
-        return out, attn_vec
-
-
-
-class TemporalFusionTransformer(nn.Module):
-    """
-    Implementation of https://arxiv.org/abs/1912.09363
-    """
-    def __init__(self, config):
-        super().__init__()
-
-        if hasattr(config, 'model'):
-            config = config.model
-
-        self.encoder_length = config.encoder_length  # determines how far into the past the model looks
-
-        self.embedding = TFTEmbedding(config)
-        self.static_encoder = StaticCovariateEncoder(config)
-
-        self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars)
-        self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-        self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars)
-        self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-
-
-        self.input_gate = GLU(config.hidden_size, config.hidden_size)
-        self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.enrichment_grn = GRN(config.hidden_size,
-                                  config.hidden_size,
-                                  context_hidden_size=config.hidden_size,
-                                  dropout=config.dropout)
-        self.attention = InterpretableMultiHeadAttention(config)
-        self.attention_gate = GLU(config.hidden_size, config.hidden_size)
-        self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.positionwise_grn = GRN(config.hidden_size,
-                                    config.hidden_size,
-                                    dropout=config.dropout)
-
-        self.decoder_gate = GLU(config.hidden_size, config.hidden_size)
-        self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles))
-
-    def forward(self, x: Dict[str, Tensor]) -> Tensor:
-        s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x)
-
-        # Static context
-        cs, ce, ch, cc = self.static_encoder(s_inp)
-        ch, cc = ch.unsqueeze(0), cc.unsqueeze(0)  # LSTM initial states
-
-        # Temporal input
-        _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]]
-        if t_observed_inp is not None:
-            _historical_inputs.insert(0,t_observed_inp[:,:self.encoder_length,:])
-
-        historical_inputs = torch.cat(_historical_inputs, dim=-2)
-        future_inputs = t_known_inp[:, self.encoder_length:]
-
-        # Encoders
-        historical_features, _ = self.history_vsn(historical_inputs, cs)
-        history, state = self.history_encoder(historical_features, (ch, cc))
-        future_features, _ = self.future_vsn(future_inputs, cs)
-        future, _ = self.future_encoder(future_features, state)
-        torch.cuda.synchronize()  # this call gives perf boost for unknown reasons
-
-        # skip connection
-        input_embedding = torch.cat([historical_features, future_features], dim=1)
-        temporal_features = torch.cat([history, future], dim=1)
-        temporal_features = self.input_gate(temporal_features)
-        temporal_features = temporal_features + input_embedding
-        temporal_features = self.input_gate_ln(temporal_features)
-
-        # Static enrichment
-        enriched = self.enrichment_grn(temporal_features, c=ce)
-
-        # Temporal self attention
-        x, _ = self.attention(enriched, mask_future_timesteps=True)
-
-        # Don't compute historical quantiles
-        x = x[:, self.encoder_length:, :]
-        temporal_features =
temporal_features[:, self.encoder_length:, :] - enriched = enriched[:, self.encoder_length:, :] - - x = self.attention_gate(x) - x = x + enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 1: - data_sampler = 
DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * args.distributed_world_size, - 
exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - global_step += 1 - - validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) - - if validate.early_stop_c >= args.early_stopping: - print_once('Early stopping') - break - - ### TEST PHASE ### - state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') - if isinstance(model, DDP): - model.module.load_state_dict(state_dict['model']) - else: - model.load_state_dict(state_dict['model']) - model.cuda().eval() - - tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) - cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) - - unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - quantiles = 2 * losses / normalizer - - if args.distributed_world_size > 1: - quantiles = quantiles.cuda() - dist.all_reduce(quantiles) - quantiles /= args.distributed_world_size - - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} - dllogger.log(step=(), data=finish_log, verbosity=1) - -def validate(args, config, model, criterion, dataloader, global_step): - if not hasattr(validate, 'best_valid_loss'): - validate.best_valid_loss = float('inf') - if not hasattr(validate, 'early_stop_c'): - validate.early_stop_c = 0 - model.eval() - - losses = [] - validation_start = time.time() - for batch in dataloader: - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - bs = next(t for t in batch.values() if t is not None).shape[0] - losses.append((p_losses, bs)) - - validation_end = time.time() - - p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch may not be full - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses = p_losses/args.distributed_world_size - - ips = len(dataloader.dataset) / (validation_end - validation_start) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} - - if log_dict['loss'] < validate.best_valid_loss: - validate.best_valid_loss = log_dict['loss'] - validate.early_stop_c = 0 - validate.conv_step = global_step - if not dist.is_initialized() or dist.get_rank() == 0: - state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() - ckpt = {'args':args, 'config':config, 'model':state_dict} - torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) - if args.distributed_world_size > 1: - dist.barrier() - else: - validate.early_stop_c += 1 - - log_dict = {'val_'+k:v for k,v in log_dict.items()} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--data_path', 
type=str, required=True, - help='Path to the dataset') - parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), - help='Dataset name') - parser.add_argument('--epochs', type=int, default=25, - help='Default number of training epochs') - parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], - help="""Subsample the dataset. Specify number of training and valid examples. - Values can be provided in scientific notation. Floats will be truncated.""") - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seed', type=int, default=1) - parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') - parser.add_argument('--clip_grad', type=float, default=0.0) - parser.add_argument('--grad_accumulation', type=int, default=0) - parser.add_argument('--early_stopping', type=int, default=1000, - help='Stop training if validation loss does not improve for more than this number of epochs.') - parser.add_argument('--results', type=str, default='/results', - help='Directory in which results are stored') - parser.add_argument('--log_file', type=str, default='dllogger.json', - help='Name of dllogger output file') - parser.add_argument('--distributed_world_size', type=int, metavar='N', - default=torch.cuda.device_count(), - help='total number of GPUs across all nodes (default: all visible GPUs)') - parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, - help='rank of the current worker') - parser.add_argument('--local_rank', default=0, type=int, - help='rank of the current worker') - parser.add_argument('--overwrite_config', type=str, default='', - help='JSON string used to overload config') - parser.add_argument('--affinity', type=str, - default='socket_unique_interleaved', - choices=['socket', 'single', 'single_unique', - 'socket_unique_interleaved', - 'socket_unique_continuous', - 'disabled'], - help='type of CPU affinity') - parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') - - - ARGS = parser.parse_args() - main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py deleted file mode 100644 index bf88be40..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/tft_pyt/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
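- # The PerformanceMeter defined below tracks per-step throughput by timing the interval since the last update. A minimal usage sketch (illustrative only; `loader`, `train_step` and `batch_size` are placeholder names, not part of this file):
- #   meter = PerformanceMeter()
- #   for step, batch in enumerate(loader):
- #       meter.reset_current_lap()
- #       train_step(batch)
- #       ips = meter.update(batch_size, exclude_from_total=step == 0)  # items/s for this step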
- -import time - -class PerformanceMeter(): - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.count = 0 - self.total_time = 0 - self.last_update_time = time.time() - self.intervals = [] - - def update(self, n, exclude_from_total=False): - delta = time.time() - self.last_update_time - self.intervals.append(delta) - if not exclude_from_total: - self.total_time += delta - self.count += n - self.avg = self.count / self.total_time - self.last_update_time = time.time() - - return n/delta - - def reset_current_lap(self): - self.last_update_time = time.time() - - def p(self, i): - assert i <= 100 - idx = int(len(self.intervals) * i / 100) - return sorted(self.intervals)[idx] - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = 
TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = 
p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * args.distributed_world_size, - exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, data=log_dict, verbosity=1) - global_step += 1 - - validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step) - - if validate.early_stop_c >= args.early_stopping: - print_once('Early stopping') - break - - ### TEST PHASE ### - state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu') - if isinstance(model, DDP): - model.module.load_state_dict(state_dict['model']) - else: - model.load_state_dict(state_dict['model']) - model.cuda().eval() - - tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb')) - cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb')) - - unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings) - losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets) - normalizer = unscaled_targets.abs().mean() - quantiles = 2 * losses / normalizer - - if args.distributed_world_size > 1: - quantiles = quantiles.cuda() - dist.all_reduce(quantiles) - quantiles /= args.distributed_world_size - - quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()} - finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step} - dllogger.log(step=(), data=finish_log, verbosity=1) - -def validate(args, config, model, criterion, dataloader, global_step): - if not hasattr(validate, 'best_valid_loss'): - validate.best_valid_loss = float('inf') - if not hasattr(validate, 'early_stop_c'): - validate.early_stop_c = 0 - model.eval() - - losses = [] - validation_start = time.time() - for batch in dataloader: - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - bs = next(t for t in batch.values() if t is not None).shape[0] - losses.append((p_losses, bs)) - - validation_end = time.time() - - p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch may not be full - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses = p_losses/args.distributed_world_size - - ips = len(dataloader.dataset) / (validation_end - validation_start) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips} - - if log_dict['loss'] < validate.best_valid_loss: - validate.best_valid_loss = log_dict['loss'] - validate.early_stop_c = 0 - validate.conv_step = global_step - if not dist.is_initialized() or dist.get_rank() == 0: - state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict() - ckpt = {'args':args, 'config':config, 'model':state_dict} - torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt')) - if args.distributed_world_size > 1: - dist.barrier() - else: - validate.early_stop_c += 1 - - log_dict = {'val_'+k:v for k,v in log_dict.items()} - dllogger.log(step=global_step, data=log_dict, 
verbosity=1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--data_path', type=str, required=True, - help='Path to the dataset') - parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(), - help='Dataset name') - parser.add_argument('--epochs', type=int, default=25, - help='Default number of training epochs') - parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1], - help="""Subsample the dataset. Specify number of training and valid examples. - Values can be provided in scientific notation. Floats will be truncated.""") - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seed', type=int, default=1) - parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision') - parser.add_argument('--clip_grad', type=float, default=0.0) - parser.add_argument('--grad_accumulation', type=int, default=0) - parser.add_argument('--early_stopping', type=int, default=1000, - help='Stop training if validation loss does not improve for more than this number of epochs.') - parser.add_argument('--results', type=str, default='/results', - help='Directory in which results are stored') - parser.add_argument('--log_file', type=str, default='dllogger.json', - help='Name of dllogger output file') - parser.add_argument('--distributed_world_size', type=int, metavar='N', - default=torch.cuda.device_count(), - help='total number of GPUs across all nodes (default: all visible GPUs)') - parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int, - help='rank of the current worker') - parser.add_argument('--local_rank', default=0, type=int, - help='rank of the current worker') - parser.add_argument('--overwrite_config', type=str, default='', - help='JSON string used to overload config') - parser.add_argument('--affinity', type=str, - default='socket_unique_interleaved', - choices=['socket', 'single', 'single_unique', - 'socket_unique_interleaved', - 'socket_unique_continuous', - 'disabled'], - help='type of CPU affinity') - parser.add_argument("--ema_decay", type=float, default=0.0, help='Use exponential moving average') - - - ARGS = parser.parse_args() - main(ARGS) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py deleted file mode 100644 index bf88be40..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/TemporalFusionTransformers/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
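- # An example single-process invocation of the training script above (paths and hyperparameter values are illustrative, not prescriptive):
- #   python train.py --dataset electricity --data_path /data/processed/electricity_bin \
- #       --batch_size 1024 --lr 1e-3 --epochs 20 --use_amp --results /results/TFT_electricity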
- -import time - -class PerformanceMeter(): - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.count = 0 - self.total_time = 0 - self.last_update_time = time.time() - self.intervals = [] - - def update(self, n, exclude_from_total=False): - delta = time.time() - self.last_update_time - self.intervals.append(delta) - if not exclude_from_total: - self.total_time += delta - self.count += n - self.avg = self.count / self.total_time - self.last_update_time = time.time() - - return n/delta - - def reset_current_lap(self): - self.last_update_time = time.time() - - def p(self, i): - assert i <= 100 - idx = int(len(self.intervals) * i / 100) - return sorted(self.intervals)[idx] - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py deleted file mode 100644 index bef26e66..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/configuration.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - 
self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
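- # Both dataset classes in this module locate example idx through a cumulative sum of per-group example counts. A sketch of the lookup with illustrative counts:
- #   cum = np.cumsum([3, 5, 2])                      # -> array([ 3,  8, 10])
- #   g_idx = bisect(cum, idx)                        # group that contains example idx
- #   e_idx = idx - cum[g_idx - 1] if g_idx else idx  # offset inside that group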
- -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. 
- # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.TARGET: - tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.ID: - tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy())) - - - tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - -def get_dataset_splits(df, config): - - if hasattr(config, 'relative_split') and config.relative_split: - forecast_len = config.example_length - config.encoder_length - # The valid split is shifted from the train split by number of the forecast steps to the future. 
- # The test split is shifted by the number of the forecast steps from the valid split - train = [] - valid = [] - test = [] - - for _, group in df.groupby(DEFAULT_ID_COL): - index = group[config.time_ids] - _train = group.loc[index < config.valid_boundary] - _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)] - _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)] - train.append(_train) - valid.append(_valid) - test.append(_test) - - train = pd.concat(train, axis=0) - valid = pd.concat(valid, axis=0) - test = pd.concat(test, axis=0) - else: - index = df[config.time_ids] - train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])] - valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])] - test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])] - - return train, valid, test - -def flatten_ids(df, config): - - if config.missing_id_strategy == 'drop': - if hasattr(config, 'combine_ids') and config.combine_ids: - index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids]) - else: - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - index = df[id_col].isna() - index = index[index == True].index # Extract indices of nans - df.drop(index, inplace=True) - - if not (hasattr(config, 'combine_ids') and config.combine_ids): - id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID) - ids = df[id_col].apply(str) - df.drop(id_col, axis=1, inplace=True) - encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values) - df[DEFAULT_ID_COL] = encoder.transform(ids) - encoders = OrderedDict({id_col: encoder}) - - else: - encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids} - encoders = OrderedDict(encoders) - lens = [len(v.classes_) for v in encoders.values()] - clens = np.roll(np.cumprod(lens), 1) - clens[0] = 1 - - # this takes a very long time; it would probably be better to create 2 dummy columns - df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1) - df.drop(config.combine_ids, axis=1, inplace=True) - - return DEFAULT_ID_COL, encoders - -def impute(df, config): - #XXX This ensures that our scaling will have the same mean. 
We still need to check the variance - if not hasattr(config, 'missing_data_label'): - return df, None - else: - imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean') - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) - data = df.values - col_mask = (data == config.missing_data_label).all(axis=0) - data[:,~col_mask] = imp.fit_transform(data) - return data, mask - -def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL): - tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET] - real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols))) - real_scalers = {} - tgt_scalers = {} - - def apply_scalers(df, name=None): - if name is None: - name = df.name - mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None - df[real_cols] = real_scalers[name].transform(df[real_cols]) - if mask is not None and any(mask): - df[real_cols].mask(mask, 10**9) - df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols]) - return df - - if config.scale_per_id: - for identifier, sliced in train.groupby(id_col): - data = sliced[real_cols] - data, _ = impute(data, config) - real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data) - # XXX We should probably remove examples that contain NaN as a target - target = sliced[tgt_cols] - tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target) - - train = train.groupby(id_col).apply(apply_scalers) - # For valid and test, leave only timeseries previously present in the train subset - # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away - valid = valid.loc[valid[id_col].isin(real_scalers.keys())] - valid = valid.groupby(id_col).apply(apply_scalers) - test = test.loc[test[id_col].isin(real_scalers.keys())] - test = test.groupby(id_col).apply(apply_scalers) - - else: - data, _ = impute(train[real_cols], config) - real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data) - tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols]) - - train = apply_scalers(train, name='') - valid = apply_scalers(valid, name='') - test = apply_scalers(test, name='') - - return train, valid, test, real_scalers, tgt_scalers - -def encode_categoricals(train, valid, test, config): - cat_encodings = {} - cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID)) - num_classes = [] #XXX Maybe we should modify config based on this value? Or issue a warning? - # For Tensor Core performance reasons we might want num_classes[i] to be divisible by 8 - - # Train categorical encoders - for c in cat_cols: - if config.missing_cat_data_strategy == 'special_token': - #XXX this will probably require some data augmentation - unique = train[c].unique() - valid[c].loc[~valid[c].isin(unique)] = '' # blank out categories unseen in train - test[c].loc[~test[c].isin(unique)] = '' - - if config.missing_cat_data_strategy == 'encode_all' or \ - config.missing_cat_data_strategy == 'special_token': - srs = pd.concat([train[c], valid[c], test[c]]).apply(str) - cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values) - elif config.missing_cat_data_strategy == 'drop': - # TODO: implement this. 
In addition to dropping rows, this has to split specific time series into chunks - # to prevent data from having temporal gaps - pass - num_classes.append(srs.nunique()) - print('Categorical variable encoding lengths: ', num_classes) - - - for split in [train, valid, test]: - for c in cat_cols: - srs = split[c].apply(str) - split[c] = srs - split.loc[:,c] = cat_encodings[c].transform(srs) - - return cat_encodings - - -def preprocess(src_path, dst_path, config): - df = pd.read_csv(src_path, index_col=0) - - for c in config.features: - if c.feature_embed_type == DataTypes.DATE: - df[c.name] = pd.to_datetime(df[c.name]) - - # Leave only columns relevant to preprocessing - relevant_columns = list(set([f.name for f in config.features] + [config.time_ids])) - df = df[relevant_columns] - - - id_col, id_encoders = flatten_ids(df, config) - df = df.reindex(sorted(df.columns), axis=1) - - train, valid, test = get_dataset_splits(df, config) - - # Length filter the data (all timeseries shorter than example len will be dropped) - #for df in [train, valid, test]: - # df.groupby(id_col).filter(lambda x: len(x) >= config.example_length) - train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length]) - valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length]) - test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length]) - - train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col) - - cat_encodings = encode_categoricals(train, valid, test, config) - - os.makedirs(dst_path, exist_ok=True) - - train.to_csv(os.path.join(dst_path, 'train.csv')) - valid.to_csv(os.path.join(dst_path, 'valid.csv')) - test.to_csv(os.path.join(dst_path, 'test.csv')) - - # Save relevant columns in binary form for faster dataloading - # IMPORTANT: We always expect id to be a single column indicating the complete timeseries - # We also expect a copy of id in the form of a static categorical input!!! 
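- # The float32/int32 round trip used below is a pure bit reinterpretation: every column is cast to float32 so the whole group fits into one matrix, the raw bits are stored as int32, and TFTBinaryDataset views them back with .view(dtype=np.float32) before casting each column to its target dtype. A sketch of why the values survive (integer codes stay exact up to 2**24):
- #   a = np.array([1.5, 7.0], dtype=np.float32)
- #   a.view(np.int32).view(np.float32)   # -> array([1.5, 7.], dtype=float32): same bits, same values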
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
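- # Note on the 't' column computed below: a Timedelta's .seconds attribute holds only the within-day remainder, so the total number of hours is recovered as days * 24 + seconds / 3600. Equivalent sketch:
- #   delta = date - earliest_time
- #   hours = delta.days * 24 + delta.seconds / 60 / 60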
- - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['categorical_id'] = label - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - output['categorical_id'] = output['id'].copy() - output['hours_from_start'] = output['t'] - output['categorical_day_of_week'] = output['day_of_week'].copy() - output['categorical_hour'] = output['hour'].copy() - - output.to_csv(os.path.join(path, 'standarized.csv')) - -def standarize_volatility(path): - df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index - - # Adds additional date/day fields - idx = [str(s).split('+')[0] for s in df.index - ] # ignore timezones, we don't need them - dates = pd.to_datetime(idx) - df['date'] = dates - df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days - df['day_of_week'] = dates.dayofweek - df['day_of_month'] = dates.day - df['week_of_year'] = dates.weekofyear - df['month'] = dates.month - df['year'] = dates.year - df['categorical_id'] = df['Symbol'].copy() - - # Processes log volatility - vol = df['rv5_ss'].copy() - vol.loc[vol == 0.] = np.nan - df['log_vol'] = np.log(vol) - - # Adds static information - symbol_region_mapping = { - '.AEX': 'EMEA', - '.AORD': 'APAC', - '.BFX': 'EMEA', - '.BSESN': 'APAC', - '.BVLG': 'EMEA', - '.BVSP': 'AMER', - '.DJI': 'AMER', - '.FCHI': 'EMEA', - '.FTMIB': 'EMEA', - '.FTSE': 'EMEA', - '.GDAXI': 'EMEA', - '.GSPTSE': 'AMER', - '.HSI': 'APAC', - '.IBEX': 'EMEA', - '.IXIC': 'AMER', - '.KS11': 'APAC', - '.KSE': 'APAC', - '.MXX': 'AMER', - '.N225': 'APAC ', - '.NSEI': 'APAC', - '.OMXC20': 'EMEA', - '.OMXHPI': 'EMEA', - '.OMXSPI': 'EMEA', - '.OSEAX': 'EMEA', - '.RUT': 'EMEA', - '.SMSI': 'EMEA', - '.SPX': 'AMER', - '.SSEC': 'APAC', - '.SSMI': 'EMEA', - '.STI': 'APAC', - '.STOXX50E': 'EMEA' - } - - df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) - - # Performs final processing - output_df_list = [] - for grp in df.groupby('Symbol'): - sliced = grp[1].copy() - sliced.sort_values('days_from_start', inplace=True) - # Impute log volatility values - sliced['log_vol'].fillna(method='ffill', inplace=True) - sliced.dropna() - output_df_list.append(sliced) - - df = pd.concat(output_df_list, axis=0) - - df.to_csv(os.path.join(path, 'standarized.csv')) - - -def standarize_traffic(path): - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(path, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(path, filename), 'r') as dat: - lines = dat.readlines() - for i, line in enumerate(lines): - if (i + 1) % 50 == 0: - print('Completed {} of {} rows for {}'.format(i + 1, len(lines), - filename)) - array = [ 
- process_list(row_split, variable_type=float, delimiter=None) - for row_split in process_list( - line, variable_type=str, delimiter=';') - ] - array_list.append(array) - - return array_list - - shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 - train_dayofweek = read_single_list('PEMS_trainlabels') - train_tensor = read_matrix('PEMS_train') - test_dayofweek = read_single_list('PEMS_testlabels') - test_tensor = read_matrix('PEMS_test') - - # Invert the shuffle permutation - print('Shuffling') - inverse_mapping = { - new_location: previous_location - for previous_location, new_location in enumerate(shuffle_order) - } - reverse_shuffle_order = np.array([ - inverse_mapping[new_location] - for new_location, _ in enumerate(shuffle_order) - ]) - - # Group and reorder based on the permutation matrix - print('Reordering') - day_of_week = np.array(train_dayofweek + test_dayofweek) - combined_tensor = np.array(train_tensor + test_tensor) - - day_of_week = day_of_week[reverse_shuffle_order] - combined_tensor = combined_tensor[reverse_shuffle_order] - - # Put everything back into a dataframe - print('Parsing as dataframe') - labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] - - hourly_list = [] - for day, day_matrix in enumerate(combined_tensor): - # Hourly data - hourly = pd.DataFrame(day_matrix.T, columns=labels) - hourly['hour_on_day'] = [int(i / 6) for i in hourly.index - ] # sampled at 10 min intervals - if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: - raise ValueError('Invalid hour! {}-{}'.format( - hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) - - hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] - hourly['sensor_day'] = day - hourly['time_on_day'] = hourly.index - hourly['day_of_week'] = day_of_week[day] - - hourly_list.append(hourly) - - hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) - - # Flatten such that each entity uses one row in the dataframe - store_columns = [c for c in hourly_frame.columns if 'traj' in c] - other_columns = [c for c in hourly_frame.columns if 'traj' not in c] - flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + - other_columns + ['id']) - - for store in store_columns: - print('Processing {}'.format(store)) - - sliced = hourly_frame[[store] + other_columns].copy() - sliced.columns = ['values'] + other_columns - sliced['id'] = int(store.replace('traj_', '')) - - # Sort by sensor-date-time - key = sliced['id'].apply(str) \ - + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \ - + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x)) - sliced = sliced.set_index(key).sort_index() - - sliced['values'] = sliced['values'].fillna(method='ffill') - sliced['prev_values'] = sliced['values'].shift(1) - sliced['next_values'] = sliced['values'].shift(-1) - - flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False) - - # Filter to match the range used by other academic papers - index = flat_df['sensor_day'] - flat_df = flat_df[index < 173].copy() - - # Create columns for categorical inputs - flat_df['categorical_id'] = flat_df['id'].copy() - flat_df['hours_from_start'] = flat_df['time_on_day'] \ + flat_df['sensor_day']*24. 
- flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() - flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() - - flat_df.to_csv(os.path.join(path, 'standarized.csv')) - - -# XXX needs rework -def standarize_favorita(data_folder): - import gc - # Extract only a subset of data to save/process for efficiency - start_date = pd.datetime(2015, 1, 1) - end_date = pd.datetime(2016, 6, 1) - - print('Regenerating data...') - - # load temporal data - temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) - - store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) - oil = pd.read_csv( - os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] - holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) - items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) - transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) - - # Take first 6 months of data - temporal['date'] = pd.to_datetime(temporal['date']) - - # Filter dates to reduce storage space requirements - if start_date is not None: - temporal = temporal[(temporal['date'] >= start_date)] - if end_date is not None: - temporal = temporal[(temporal['date'] < end_date)] - - dates = temporal['date'].unique() - - # Add trajectory identifier - temporal['traj_id'] = temporal['store_nbr'].apply( - str) + '_' + temporal['item_nbr'].apply(str) - temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( - str) - - # Remove all IDs with negative returns - print('Removing returns data') - min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() - valid_ids = set(min_returns[min_returns >= 0].index) - selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) - new_temporal = temporal[selector].copy() - del temporal - gc.collect() - temporal = new_temporal - temporal['open'] = 1 - - # Resampling - print('Resampling to regular grid') - resampled_dfs = [] - for traj_id, raw_sub_df in temporal.groupby('traj_id'): - print('Resampling', traj_id) - sub_df = raw_sub_df.set_index('date', drop=True).copy() - sub_df = sub_df.resample('1d').last() - sub_df['date'] = sub_df.index - sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ - = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') - sub_df['open'] = sub_df['open'].fillna( - 0) # flag where sales data is unknown - sub_df['log_sales'] = np.log(sub_df['unit_sales']) - - resampled_dfs.append(sub_df.reset_index(drop=True)) - - new_temporal = pd.concat(resampled_dfs, axis=0) - del temporal - gc.collect() - temporal = new_temporal - - print('Adding oil') - oil.name = 'oil' - oil.index = pd.to_datetime(oil.index) - #XXX the lines below match the value of oil on a given date with the rest of the timeseries; - # missing values in the oil series are copied from the preceding index. Then the oil series is joined with - # temporal. Then there are some dates present in temporal which aren't present in oil, for which - # the oil value is substituted with -1. WHY?! - #TODO: check how many nans there are after the first step. Previously the oil series was extended by the dates - # present in the dates variable with NaN values, which were forward filled. - # This behavior is no longer supported by pandas, so we changed to the DataFrame.isin method. - # This leaves us with more nans after the first step than previously. To achieve the previous behavior - # we have to join the series before filling NaNs. 
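- # A minimal sketch of the previous behavior described above (an assumption reconstructed from the comment, not from the original code): extend the oil series to the dates seen in temporal, then forward fill before joining:
- #   oil_full = oil.reindex(oil.index.union(pd.DatetimeIndex(dates))).fillna(method='ffill')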
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
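For orientation, a minimal sketch of how the ModelEma class deleted above is typically driven; the toy model, loop, and hyperparameters are illustrative, and only the ModelEma API comes from the file:

```python
import torch
import torch.nn as nn

model = nn.Linear(8, 1)
ema = ModelEma(model, decay=0.999)  # shadow copy; assumes ModelEma from above is in scope
opt = torch.optim.SGD(model.parameters(), lr=0.1)

for _ in range(10):
    x, y = torch.randn(4, 8), torch.randn(4, 1)
    loss = nn.functional.mse_loss(model(x), y)
    opt.zero_grad()
    loss.backward()
    opt.step()
    # Each call folds the live weights into the shadow copy:
    # ema_w = decay * ema_w + (1 - decay) * model_w
    ema.update(model)

with torch.no_grad():
    smoothed = ema(x)  # forward() delegates to the averaged module
```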
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = 
torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - 
_g.update({f'P{int(q*100)}': g[:,:,i+1] for i, q in enumerate(config.quantiles)})
-
-            if args.joint_visualization:
-                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
-                for q, t in _g.items(): # target and quantiles, time-horizon values
-                    if q == 'targets':
-                        targets = torch.cat([t[:,0], t[-1,1:]]) # WIP
-                        # We want to plot targets on the same graph as predictions. This could probably be written more cleanly.
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons which are shifted in phase.
-                    # The next lines realign them.
-                    y = t.new_full((t.shape[0] + t.shape[1] - 1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y): # timestep, time-horizon values
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-
-    # Set up dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum': sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(quantiles['test_p10'], quantiles['test_p50'], quantiles['test_p90']))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each time series on a separate plot. Projections will be concatenated.')
-    parser.add_argument('--save_predictions', action='store_true')
-    parser.add_argument('--results', type=str, default='/results')
-    parser.add_argument('--log_file', type=str, default='dllogger.json')
-    ARGS = parser.parse_args()
-    main(ARGS)
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py
deleted file mode 100644
index 83d2ac7f..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/log_helper.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
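The q-risk computed in `inference` above is the standard TFT evaluation metric: twice the quantile (pinball) loss divided by the absolute target (the code reduces both with means, which gives the same ratio as sums when the reductions match). A self-contained sketch of that computation, with the pinball term written out since the repo's QuantileLoss class lives in the deleted criterions.py:

```python
import torch

def pinball(pred, target, q):
    # q * (y - y_hat) when under-predicting, (1 - q) * (y_hat - y) otherwise
    diff = target - pred
    return torch.max(q * diff, (q - 1) * diff)

target = torch.tensor([10.0, 12.0, 9.0])
preds = {0.1: torch.tensor([8.0, 11.0, 8.5]),
         0.5: torch.tensor([10.5, 12.2, 9.1]),
         0.9: torch.tensor([12.0, 13.5, 10.0])}

# q-risk as in `inference` above: 2 * pinball / |target|
for q, p in preds.items():
    q_risk = 2 * pinball(p, target, q).sum() / target.abs().sum()
    print(f'P{int(q * 100)} risk: {q_risk.item():.4f}')
```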
-
-import os
-import subprocess
-import sys
-import itertools
-import atexit
-
-import dllogger
-from dllogger import Backend, JSONStreamBackend, StdOutBackend
-
-import torch.distributed as dist
-from torch.utils.tensorboard import SummaryWriter
-
-class TensorBoardBackend(Backend):
-    def __init__(self, verbosity, log_dir):
-        super().__init__(verbosity=verbosity)
-        self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'),
-                                            flush_secs=120,
-                                            max_queue=200
-                                            )
-        self.hp_cache = None
-        atexit.register(self.summary_writer.close)
-
-    @property
-    def log_level(self):
-        return self._log_level
-
-    def metadata(self, timestamp, elapsedtime, metric, metadata):
-        pass
-
-    def log(self, timestamp, elapsedtime, step, data):
-        if step == 'HPARAMS':
-            parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
-            # Unpack lists and tuples
-            for d in [{k + f'_{i}': v for i, v in enumerate(l)} for k, l in data.items() if isinstance(l, (list, tuple))]:
-                parameters.update(d)
-            # Remove custom classes. Note: filter the accumulated `parameters`, not the raw
-            # `data`, otherwise the unpacked list/tuple entries above are discarded.
-            parameters = {k: v for k, v in parameters.items() if isinstance(v, (int, float, str, bool))}
-            parameters.update({k: 'None' for k, v in data.items() if v is None})
-            self.hp_cache = parameters
-        if step == ():
-            if self.hp_cache is None:
-                print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr)
-                return
-            self.summary_writer.add_hparams(self.hp_cache, data)
-        if not isinstance(step, int):
-            return
-        for k, v in data.items():
-            self.summary_writer.add_scalar(k, v, step)
-
-    def flush(self):
-        pass
-
-def setup_logger(args):
-    os.makedirs(args.results, exist_ok=True)
-    log_path = os.path.join(args.results, args.log_file)
-
-    if os.path.exists(log_path):
-        for i in itertools.count():
-            s_fname = args.log_file.split('.')
-            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
-    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git 
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. Temporal known a priori continuous - # 5. Temporal observed categorical - # 6. 
Temporal observed continuous
-        # 7. Temporal observed targets (time series observed so far)
-
-        self.s_cat_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None
-        self.t_cat_k_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None
-        self.t_cat_o_embed = nn.ModuleList([
-            nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None
-
-        self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size))
-
-        self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None
-        self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None
-        self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None
-        self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size))
-
-        if self.s_cont_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors)
-        if self.t_cont_k_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors)
-        if self.t_cont_o_embedding_vectors is not None:
-            torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors)
-        torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors)
-
-    def _apply_embedding(self,
-                         cat: Optional[Tensor],
-                         cont: Optional[Tensor],
-                         cat_emb: Optional[nn.ModuleList],
-                         cont_emb: Tensor,
-                         cont_bias: Tensor,
-                         ) -> Optional[Tensor]:
-        e_cat = torch.stack([embed(cat[..., i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None
-        if cont is not None:
-            # the line below is equivalent to the following einsums
-            # e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb)
-            # e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb)
-            e_cont = torch.mul(cont.unsqueeze(-1), cont_emb)
-            e_cont = e_cont + cont_bias
-        else:
-            e_cont = None
-
-        if e_cat is not None and e_cont is not None:
-            return torch.cat([e_cat, e_cont], dim=-2)
-        elif e_cat is not None:
-            return e_cat
-        elif e_cont is not None:
-            return e_cont
-        else:
-            return None
-
-    def forward(self, x: Dict[str, Tensor]):
-        # temporal/static categorical/continuous known/observed input
-        s_cat_inp = x.get('s_cat', None)
-        s_cont_inp = x.get('s_cont', None)
-        t_cat_k_inp = x.get('k_cat', None)
-        t_cont_k_inp = x.get('k_cont', None)
-        t_cat_o_inp = x.get('o_cat', None)
-        t_cont_o_inp = x.get('o_cont', None)
-        t_tgt_obs = x['target']  # Has to be present
-
-        # Static inputs are expected to be equal for all timesteps.
-        # For memory efficiency there is no assert statement
-        s_cat_inp = s_cat_inp[:, 0, :] if s_cat_inp is not None else None
-        s_cont_inp = s_cont_inp[:, 0, :] if s_cont_inp is not None else None
-
-        s_inp = self._apply_embedding(s_cat_inp,
-                                      s_cont_inp,
-                                      self.s_cat_embed,
-                                      self.s_cont_embedding_vectors,
-                                      self.s_cont_embedding_bias)
-        t_known_inp = 
self._apply_embedding(t_cat_k_inp, - t_cont_k_inp, - self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - #the line below performs batched matrix vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_c context - # state_h context - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) - attn_score.mul_(self.scale) - - if mask_future_timesteps: - attn_score = 
attn_score + self._mask
-
-        attn_prob = F.softmax(attn_score, dim=3)
-        attn_prob = self.attn_dropout(attn_prob)
-
-        # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v)
-        attn_vec = torch.matmul(attn_prob, v.unsqueeze(1))
-        m_attn_vec = torch.mean(attn_vec, dim=1)
-        out = self.out_proj(m_attn_vec)
-        out = self.out_dropout(out)
-
-        return out, attn_vec
-
-
-
-class TemporalFusionTransformer(nn.Module):
-    """
-    Implementation of https://arxiv.org/abs/1912.09363
-    """
-    def __init__(self, config):
-        super().__init__()
-
-        if hasattr(config, 'model'):
-            config = config.model
-
-        self.encoder_length = config.encoder_length  # determines how far into the past the model looks
-
-        self.embedding = TFTEmbedding(config)
-        self.static_encoder = StaticCovariateEncoder(config)
-
-        self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars)
-        self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-        self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars)
-        self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-
-
-        self.input_gate = GLU(config.hidden_size, config.hidden_size)
-        self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.enrichment_grn = GRN(config.hidden_size,
-                                  config.hidden_size,
-                                  context_hidden_size=config.hidden_size,
-                                  dropout=config.dropout)
-        self.attention = InterpretableMultiHeadAttention(config)
-        self.attention_gate = GLU(config.hidden_size, config.hidden_size)
-        self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.positionwise_grn = GRN(config.hidden_size,
-                                    config.hidden_size,
-                                    dropout=config.dropout)
-
-        self.decoder_gate = GLU(config.hidden_size, config.hidden_size)
-        self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles))
-
-    def forward(self, x: Dict[str, Tensor]) -> Tensor:
-        s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x)
-
-        # Static context
-        cs, ce, ch, cc = self.static_encoder(s_inp)
-        ch, cc = ch.unsqueeze(0), cc.unsqueeze(0)  # LSTM initial states
-
-        # Temporal input
-        _historical_inputs = [t_known_inp[:, :self.encoder_length, :], t_observed_tgt[:, :self.encoder_length, :]]
-        if t_observed_inp is not None:
-            _historical_inputs.insert(0, t_observed_inp[:, :self.encoder_length, :])
-
-        historical_inputs = torch.cat(_historical_inputs, dim=-2)
-        future_inputs = t_known_inp[:, self.encoder_length:]
-
-        # Encoders
-        historical_features, _ = self.history_vsn(historical_inputs, cs)
-        history, state = self.history_encoder(historical_features, (ch, cc))
-        future_features, _ = self.future_vsn(future_inputs, cs)
-        future, _ = self.future_encoder(future_features, state)
-        torch.cuda.synchronize()  # this call gives a perf boost for unknown reasons
-
-        # Skip connection
-        input_embedding = torch.cat([historical_features, future_features], dim=1)
-        temporal_features = torch.cat([history, future], dim=1)
-        temporal_features = self.input_gate(temporal_features)
-        temporal_features = temporal_features + input_embedding
-        temporal_features = self.input_gate_ln(temporal_features)
-
-        # Static enrichment
-        enriched = self.enrichment_grn(temporal_features, c=ce)
-
-        # Temporal self-attention
-        x, _ = self.attention(enriched, mask_future_timesteps=True)
-
-        # Don't compute historical quantiles
-        x = x[:, self.encoder_length:, :]
-        temporal_features = temporal_features[:, self.encoder_length:, :]
-        enriched = enriched[:, self.encoder_length:, :]
-
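As an aside before the final gating steps below: the commented-out einsums in InterpretableMultiHeadAttention above document what the permute+matmul rewrites compute. A quick verification sketch with toy shapes (b=batch, n=heads, i/j=time, d=head dim), plus the effect of the upper-triangular causal mask registered by the module:

```python
import torch

b, n, t, d = 2, 4, 6, 8
q = torch.randn(b, t, n, d)
k = torch.randn(b, t, n, d)

# einsum reference vs. the permute+matmul rewrite used in the module
ref = torch.einsum('bind,bjnd->bnij', q, k)
opt = torch.matmul(q.permute(0, 2, 1, 3), k.permute(0, 2, 3, 1))
assert torch.allclose(ref, opt, atol=1e-5)

# The mask adds -inf above the diagonal, so after softmax every
# timestep attends only to itself and the past.
mask = torch.triu(torch.full((t, t), float('-inf')), 1).unsqueeze(0)
probs = torch.softmax(ref[0, 0] + mask[0], dim=-1)
print(probs)  # lower-triangular rows, each summing to 1
```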
- x = self.attention_gate(x) - x = x + enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
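For reference, the `python -c` one-liners in get_data.sh above, spelled out as a plain script for the electricity dataset (paths as in the script; `standarize_electricity`, `preprocess`, and `ElectricityConfig` come from the data_utils.py and configuration.py modules deleted elsewhere in this diff):

```python
from data_utils import standarize_electricity, preprocess
from configuration import ElectricityConfig

# Download/unzip is handled by the shell script; these two calls do the rest.
standarize_electricity('/data/raw/electricity')          # writes standarized.csv
preprocess('/data/raw/electricity/standarized.csv',
           '/data/processed/electricity_bin',
           ElectricityConfig())
```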
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
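The run scripts here (including the one whose body follows below) read their hyperparameters through bash's `: ${VAR:=default}` idiom, so a call like `LR=5e-4 NGPU=4 bash scripts/run_traffic.sh` overrides the defaults without editing the file. A Python sketch of the same override-or-default pattern:

```python
import os

# Mirrors `: ${SEED:=1}`, `: ${LR:=1e-3}`, etc. in the scripts.
SEED = int(os.environ.get('SEED', 1))
LR = float(os.environ.get('LR', 1e-3))
NGPU = int(os.environ.get('NGPU', 8))
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 1024))
EPOCHS = int(os.environ.get('EPOCHS', 20))
```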
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile deleted file mode 100644 index 70552ea1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3 - -FROM ${FROM_IMAGE_NAME} - -RUN apt-get update && apt-get install -y libb64-dev libb64-0d -WORKDIR /workspace -#ENV PYTHONPATH /workspace -RUN pip uninstall -y typing - -RUN apt update && apt install -y p7zip-full -COPY requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --ignore-installed -r requirements.txt -RUN pip install --no-cache-dir -e git://github.com/NVIDIA/dllogger#egg=dllogger - -COPY . . -ENV PYTHONPATH="${PYTHONPATH}:/workspace" - -# AMP monkey-patch -RUN sed -i 's/ def forward(ctx,/ @amp.custom_fwd\(cast_inputs=torch.float32\)\n def forward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/ def backward(ctx,/ @amp.custom_bwd\n def backward(ctx,/g' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py -RUN sed -i 's/^import torch$/import torch\nfrom torch.cuda import amp/' /opt/conda/lib/python3.8/site-packages/apex/normalization/fused_layer_norm.py diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE deleted file mode 100644 index 261eeb9e..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENCE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
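The sed-based "AMP monkey-patch" at the end of the Dockerfile above wraps apex's fused LayerNorm autograd Function in `torch.cuda.amp` decorators so it always runs in float32 under autocast. A self-contained toy showing the decorator pair being injected (the `Square` Function is illustrative, not apex's; a CUDA device is assumed):

```python
import torch
from torch.cuda import amp

class Square(torch.autograd.Function):
    # The patch inserts exactly these decorators above apex's
    # forward(ctx, ...) and backward(ctx, ...) definitions.
    @staticmethod
    @amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    @amp.custom_bwd
    def backward(ctx, grad_out):
        x, = ctx.saved_tensors
        return 2 * x * grad_out

x = torch.randn(4, device='cuda', requires_grad=True)
with amp.autocast():
    y = Square.apply(x)  # inputs are cast to float32 inside forward
y.sum().backward()
```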
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT
deleted file mode 100644
index 5d1d88cf..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/LICENSE AGREEMENT
+++ /dev/null
@@ -1,25 +0,0 @@
-Individual Contributor License Agreement (CLA)
-Thank you for submitting your contributions to this project.
-
-By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions to the project.
-
-License.
-You hereby represent that all present, past and future contributions are governed by the Apache 2.0 License copyright statement.
-
-This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights of the code or documents you contribute to the project itself or its maintainers. Furthermore, you also represent that you have the authority to perform the above waiver with respect to the entirety of your contributions.
-
-Moral Rights.
-To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all of your “moral rights” in or relating to your contributions for the benefit of the project.
-
-Third Party Content.
-If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary rights associated with your Contribution (“Third Party Rights”), then you agree to include with the submission of your Contribution full details respecting such Third Party Content and Third Party Rights, including, without limitation, identification of which aspects of your Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights do not apply to any portion of a Project that is incorporated into your Contribution to that same Project.
-
-Representations.
-You represent that, other than the Third Party Content and Third Party Rights identified by you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were created in the course of your employment with your past or present employer(s), you represent that such employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer(s) has waived all of their right, title or interest in or to your Contributions.
-
-Disclaimer.
-To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" basis, without any warranties or conditions, express or implied, including, without limitation, any implied warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not required to provide support for your Contributions, except to the extent you desire to provide support.
-
-No Obligation.
-You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions into the project. The decision to use or incorporate your contributions into the project will be made at the sole discretion of the maintainers or their authorized delegates.
-
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE
deleted file mode 100644
index ae19bb47..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/NOTICE
+++ /dev/null
@@ -1,3 +0,0 @@
-TFT for PyTorch
-
-This repository includes software from https://github.com/google-research/google-research/tree/master/tft licensed under the Apache License, Version 2.0
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md
deleted file mode 100644
index 69b39d12..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/README.md
+++ /dev/null
@@ -1,465 +0,0 @@
-# Temporal Fusion Transformer For PyTorch
-
-This repository provides a script and recipe to train the Temporal Fusion Transformer model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA.
-
-## Table Of Contents
-
-- [Model overview](#model-overview)
-  * [Model architecture](#model-architecture)
-  * [Default configuration](#default-configuration)
-  * [Feature support matrix](#feature-support-matrix)
-    * [Features](#features)
-  * [Mixed precision training](#mixed-precision-training)
-    * [Enabling mixed precision](#enabling-mixed-precision)
-    * [Enabling TF32](#enabling-tf32)
-  * [Glossary](#glossary)
-- [Setup](#setup)
-  * [Requirements](#requirements)
-- [Quick Start Guide](#quick-start-guide)
-- [Advanced](#advanced)
-  * [Scripts and sample code](#scripts-and-sample-code)
-  * [Command-line options](#command-line-options)
-  * [Getting the data](#getting-the-data)
-    * [Dataset guidelines](#dataset-guidelines)
-    * [Multi-dataset](#multi-dataset)
-  * [Training process](#training-process)
-  * [Inference process](#inference-process)
-- [Performance](#performance)
-  * [Benchmarking](#benchmarking)
-    * [Training performance benchmark](#training-performance-benchmark)
-    * [Inference performance benchmark](#inference-performance-benchmark)
-  * [Results](#results)
-    * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
-      * [Training stability test](#training-stability-test)
-    * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
-      * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
-- [Release notes](#release-notes)
-  * [Changelog](#changelog)
-  * [Known issues](#known-issues)
-
-
-
-## Model overview
-
-The Temporal Fusion Transformer [TFT](https://arxiv.org/abs/1912.09363) model is a state-of-the-art architecture for interpretable, multi-horizon time-series prediction. The model was first developed and [implemented by Google](https://github.com/google-research/google-research/tree/master/tft) in collaboration with the University of Oxford.
-This implementation differs from the reference implementation in how it handles missing data, which is common in production datasets: missing values are either masked in the attention matrices or embedded as a special value in the latent space.
-This model enables the prediction of confidence intervals for future values of a time series over multiple future timesteps.
-
-This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.45x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
-### Model architecture
-
-The TFT model is a hybrid architecture joining LSTM encoding of time series with the interpretability of transformer attention layers. Prediction is based on three types of variables: static (constant for a given time series), known (known in advance for the whole history and future), and observed (known only for historical data). All these variables come in two flavors: categorical and continuous. In addition to these variables, the model is fed the historical values of the time series itself. All variables are embedded in a high-dimensional space by learning an embedding vector for each. Embeddings of categorical variables are learned in the classical sense of embedding discrete values. For each continuous variable, the model learns a single vector, which is then scaled by the variable’s value for further processing. The next step is to filter variables through the Variable Selection Network (VSN), which assigns weights to the inputs in accordance with their relevance to the prediction. Static variables are used as a context for the variable selection of other variables and as an initial state of the LSTM encoders.
-After encoding, variables are passed to multi-head attention layers (decoder), which produce the final prediction. The whole architecture is interwoven with residual connections and gating mechanisms that allow it to adapt to various problems by skipping parts of the network.
-For the sake of explainability, the heads of the self-attention layers share value matrices. This allows interpreting self-attention as an ensemble of models predicting different temporal patterns over the same feature set. The other feature that helps us understand the model is the VSN activations, which tell us how relevant a given feature is to the prediction.
-![](TFT_architecture.PNG)
-*image source: https://arxiv.org/abs/1912.09363*
-
-### Default configuration
-
-The specific configuration of the TFT model depends on the dataset used. Not only is the volume of the model subject to change, but so are the data sampling and preprocessing strategies. During preprocessing, data is normalized per feature. For some of the datasets, we apply scaling per time series, which takes into account shifts in distribution between entities (i.e., a factory consumes more electricity than an average house). The model is trained with the quantile loss `QL(y, ŷ, q) = q * max(y - ŷ, 0) + (1 - q) * max(ŷ - y, 0)`, computed for the quantiles q in [0.1, 0.5, 0.9]; for q = 0.9, for example, under-predictions are penalized nine times more heavily than over-predictions. The default configurations are tuned for distributed training on DGX-1 32GB with mixed precision. We use dynamic loss scaling. Specific values are provided in the table below.
-
-| Dataset | Training samples | Validation samples | Test samples | History length | Forecast horizon | Dropout | Hidden size | #Heads | BS | LR | Gradient clipping |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Electricity | 450k | 50k | 53.5k | 168 | 24 | 0.1 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-| Traffic | 450k | 50k | 139.6k | 168 | 24 | 0.3 | 128 | 4 | 8x1024 | 1e-3 | 0.0 |
-
-### Feature support matrix
-
-The following features are supported by this model:
-
-| Feature                    | Supported
-|----------------------------|--------------------------
-| Distributed data parallel | Yes
-| PyTorch AMP               | Yes
-
-
-#### Features
-
-[Automatic Mixed Precision](https://pytorch.org/docs/stable/amp.html)
-provides an easy way to leverage Tensor Cores’ performance. It allows the execution of parts of a network in lower precision. Refer to [Mixed precision training](#mixed-precision-training) for more information.
-
-[PyTorch
-DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - a module
-wrapper that enables easy multiprocess distributed data-parallel
-training.
-
-### Mixed precision training
-
-Mixed precision is the combined use of different numerical precisions in a
-computational method.
-[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
-computational speedup by performing operations in half-precision format while
-storing minimal information in single-precision to retain as much information
-as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta architecture, and continuing through
-the Turing and Ampere architectures, significant training speedups can be
-achieved by switching to
-mixed precision -- up to 3x overall speedup on the most arithmetically intense
-model architectures. Using mixed precision training previously required two
-steps:
-
-1. Porting the model to use the FP16 data type where appropriate.
-2. Manually adding loss scaling to preserve small gradient values.
-
-The ability to train deep learning networks with lower precision was introduced
-in the Pascal architecture and first supported in [CUDA
-8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
-Learning SDK.
-
-For information about:
-* How to train using mixed precision, refer to the [Mixed Precision
-  Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
-  Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
-  documentation.
-* Techniques used for mixed precision training, refer to the [Mixed-Precision
-  Training of Deep Neural
-  Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
-  blog.
-* APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in
-  PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/)
-  blog.
-
-
-#### Enabling mixed precision
-
-
-Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision `torch.cuda.amp` module, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied automatically by the `GradScaler` class.
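-As a rough illustration of this pattern, the snippet below sketches a generic AMP training step with `autocast` and `GradScaler`. It is a minimal sketch with a placeholder model and random data, not the actual loop from `train.py`, and it assumes a CUDA-capable machine:
-
-```python
-import torch
-
-model = torch.nn.Linear(16, 3).cuda()            # placeholder model
-optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
-scaler = torch.cuda.amp.GradScaler()
-
-for step in range(10):
-    inputs = torch.randn(32, 16, device='cuda')  # placeholder batch
-    targets = torch.randn(32, 3, device='cuda')
-    optimizer.zero_grad()
-    with torch.cuda.amp.autocast():              # forward pass runs in mixed precision
-        loss = torch.nn.functional.mse_loss(model(inputs), targets)
-    scaler.scale(loss).backward()                # scale the loss to preserve small gradients
-    scaler.step(optimizer)                       # unscales gradients, then steps the optimizer
-    scaler.update()                              # adjust the loss scale for the next iteration
-```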
-All the necessary steps to implement AMP are described in detail in the [AMP examples](https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples).
-
-To enable mixed precision for TFT, simply add the `--use_amp` option to the training script.
-#### Enabling TF32
-
-TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
-
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models that require a high dynamic range for weights or activations.
-
-For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
-
-TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
-
-
-
-### Glossary
-
-**Multi-horizon prediction**
-The process of estimating values of a time series for multiple future time steps.
-
-**Quantiles**
-Cut points dividing the range of a probability distribution into continuous intervals with equal probabilities.
-
-**Time series**
-A series of data points indexed and equally spaced in time.
-
-**Transformer**
-The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called the Transformer, which uses an attention mechanism to transform one sequence into another.
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to start training the TFT model.
-
-### Requirements
-
-This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-- Supported GPUs:
-  - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
-  - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
-
-For more information about how to get started with NGC containers, refer to the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
-
-If you cannot use the PyTorch NGC container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
-
-## Quick Start Guide
-
-To train your model using mixed or TF32 precision with Tensor Cores, perform the following steps using the default parameters of the TFT model on any of the benchmark datasets. For the specifics concerning training and inference, refer to the [Advanced](#advanced) section.
-
-1. Clone the repository.
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/Forecasting/TFT
-```
-
-2. Build the TFT PyTorch NGC container.
-```bash
-docker build --network=host -t tft .
-```
-
-3. Start an interactive session in the NGC container to run training/inference.
-```bash
-docker run -it --rm --ipc=host --network=host --gpus all -v /path/to/your/data:/data/ tft
-```
-
-Note: Be sure to mount your dataset using the `-v` flag to make it available for training inside the NVIDIA Docker container.
-
-4. Download and preprocess datasets.
-```bash
-bash scripts/get_data.sh
-```
-
-5. Start training. Choose one of the scripts provided in the `scripts/` directory. Results are stored in the `/results` directory.
-These scripts are tuned for DGX-1 32GB. If you have a different system, use the `NGPU` and `BATCH_SIZE` variables to adjust the parameters for your system.
-```bash
-bash scripts/run_electricity.sh
-bash scripts/run_traffic.sh
-```
-
-6. Start validation/evaluation. The metric we use for evaluation is q-risk. We can compare it per quantile in the Pareto sense or jointly as one number indicating accuracy.
-```bash
-python inference.py \
---checkpoint <pretrained model path> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin
-```
-
-7. Start inference/predictions. Visualize and save predictions by running the following command.
-```bash
-python inference.py \
---checkpoint <pretrained model path> \
---data /data/processed/<dataset>/test.csv \
---cat_encodings /data/processed/<dataset>/cat_encodings.bin \
---tgt_scalers /data/processed/<dataset>/tgt_scalers.bin \
---visualize \
---save_predictions
-```
-
-
-
-Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance results](#training-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
-## Advanced
-
-The following sections provide more details about the dataset, running training and inference, and the training results.
-
-### Scripts and sample code
-
-In the root directory, the most important files are:
-
-`train.py`: Entry point for training
-`data_utils.py`: File containing the dataset implementation and preprocessing functions
-`modeling.py`: Definition of the model
-`configuration.py`: Contains configuration classes for various experiments
-`test.py`: Entry point for testing a trained model
-`Dockerfile`: Container definition
-`log_helper.py`: Contains helper functions for setting up dllogger
-`criterions.py`: Definitions of loss functions
-
-The `scripts` directory contains scripts for the default use cases:
-`run_electricity.sh`: Trains the default model on the electricity dataset
-`run_traffic.sh`: Trains the default model on the traffic dataset
-
-### Command-line options
-
-To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
-`python train.py --help`.
-
-The following example output is printed when running the model:
-```
-usage: train.py [-h] --data_path DATA_PATH --dataset {electricity,volatility,traffic,favorita} [--epochs EPOCHS] [--sample_data SAMPLE_DATA SAMPLE_DATA] [--batch_size BATCH_SIZE] [--lr LR] [--seed SEED] [--use_amp] [--clip_grad CLIP_GRAD]
-                [--early_stopping EARLY_STOPPING] [--results RESULTS] [--log_file LOG_FILE] [--distributed_world_size N] [--distributed_rank DISTRIBUTED_RANK] [--local_rank LOCAL_RANK] [--overwrite_config OVERWRITE_CONFIG]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data_path DATA_PATH
-  --dataset {electricity,volatility,traffic,favorita}
-  --epochs EPOCHS
-  --sample_data SAMPLE_DATA SAMPLE_DATA
-  --batch_size BATCH_SIZE
-  --lr LR
-  --seed SEED
-  --use_amp             Enable automatic mixed precision
-  --clip_grad CLIP_GRAD
-  --early_stopping EARLY_STOPPING
-                        Stop training if validation loss does not improve for more than this number of epochs.
-  --results RESULTS
-  --log_file LOG_FILE
-  --distributed_world_size N
-                        total number of GPUs across all nodes (default: all visible GPUs)
-  --distributed_rank DISTRIBUTED_RANK
-                        rank of the current worker
-  --local_rank LOCAL_RANK
-                        rank of the current worker
-  --overwrite_config OVERWRITE_CONFIG
-                        JSON string used to overload config
-
-```
-
-### Getting the data
-
-The TFT model was trained on the electricity and traffic benchmark datasets. This repository contains the `get_data.sh` download script, which, for the electricity and traffic datasets, will automatically download and preprocess the training, validation and test datasets, and produce files that contain scalers.
-#### Dataset guidelines
-
-The `data_utils.py` file contains all functions that are used to preprocess the data. Initially, the data is loaded into a `pandas.DataFrame` and parsed into the common format which contains the features we will use for training. Then the standardized data is cleaned, normalized, encoded and binarized.
-This step does the following:
-- Drop all the columns that are not marked in the configuration file as used for training or preprocessing
-- Flatten indices in case time series are indexed by more than one column
-- Split the data into training, validation and test splits
-- Filter out all the time series shorter than the minimal example length
-- Normalize columns marked as continuous in the configuration file
-- Encode as integers columns marked as categorical
-- Save the data in csv and binary formats
-
-#### Multi-dataset
-In order to use an alternate dataset, you have to write a function that parses your data to a common format. The format is as follows:
-- There is at least one id column
-- There is exactly one time column (that can also be used as a feature column)
-- Each feature is in a separate column
-- Each row represents a moment in time for only one time series
-Additionally, you must specify a configuration of the network, including a data description. Refer to the example in the `configuration.py` file.
-### Training process
-
-The `train.py` script is an entry point for a training procedure. Refined recipes can be found in the `scripts` directory.
-The model trains for at most `--epochs` epochs. If the option `--early_stopping N` is set, then training ends early if the validation loss has not improved for N subsequent epochs.
-The details of the architecture and the dataset configuration are encapsulated by the `--dataset` option. This option chooses one of the configurations stored in the `configuration.py` file; a trimmed sketch of such a configuration is shown below.
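-For a rough idea of what such a configuration contains, the sketch below mirrors a few fields of the `ElectricityConfig` class defined in `configuration.py`. The class name and values here are illustrative placeholders, and a real configuration needs the full set of attributes from `configuration.py`:
-
-```python
-from data_utils import InputTypes, DataTypes, FeatureSpec
-
-class MyDatasetConfig():
-    def __init__(self):
-        # Column layout: at least one ID column, exactly one TIME column,
-        # one TARGET column, plus any known/observed/static features
-        self.features = [
-            FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL),
-            FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS),
-            FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS),
-        ]
-        # Boundaries on the time index used to carve out train/valid/test splits
-        self.time_ids = 'days_from_start'
-        self.train_range = (0, 100)
-        self.valid_range = (90, 120)
-        self.test_range = (110, 130)
-        # Windowing: each example is example_length steps long, of which the
-        # first encoder_length steps are historical context
-        self.example_length = 8 * 24
-        self.encoder_length = 7 * 24
-        self.quantiles = [0.1, 0.5, 0.9]
-```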
-You can enable mixed precision training by providing the `--use_amp` option. The training script supports multi-GPU training with the APEX package. To enable distributed training, prepend the training command with `python -m torch.distributed.launch --nproc_per_node=${NGPU}`.
-
-Example command:
-```
-python -m torch.distributed.launch --nproc_per_node=8 train.py \
-        --dataset electricity \
-        --data_path /data/processed/electricity_bin \
-        --batch_size=1024 \
-        --sample_data 450000 50000 \
-        --lr 1e-3 \
-        --epochs 25 \
-        --early_stopping 5 \
-        --seed 1 \
-        --use_amp \
-        --results /results/TFT_electricity_bs8x1024_lr1e-3/seed_1
-```
-
-The model is trained by optimizing the quantile loss described in the [Default configuration](#default-configuration) section. After training, the checkpoint with the lowest validation loss is evaluated on a test split with the q-risk metric (the quantile loss normalized by the sum of the absolute values of the targets).
-Results are by default stored in the `/results` directory. This can be changed by providing the `--results` option. At the end of the training, the results directory will contain the trained checkpoint which had the lowest validation loss, dllogger logs (in a dictionary-per-line format), and TensorBoard logs.
-
-### Inference process
-
-Inference can be run by launching the `inference.py` script. The script requires a trained checkpoint to run. It is crucial to prepare the data in the same way as the training data prior to running the inference. Example command:
-```
-python inference.py \
---checkpoint /results/checkpoint.pt \
---data /data/processed/electricity_bin/test.csv \
---tgt_scalers /data/processed/electricity_bin/tgt_scalers.bin \
---cat_encodings /data/processed/electricity_bin/cat_encodings.bin \
---batch_size 2048 \
---visualize \
---save_predictions \
---joint_visualization \
---results /results \
---use_amp
-```
-
-In the default setting, the script evaluates the model on the specified dataset and prints the q-risk on that dataset. To save the predictions, use the `--save_predictions` option; predictions are then stored, in csv format, in the directory specified by the `--results` option. The `--joint_visualization` option plots graphs in TensorBoard format, allowing us to inspect the results and compare them to the true values. Using `--visualize`, you can save plots for each example in a separate file.
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in training and inference modes.
-
-#### Training performance benchmark
-
-In order to run training benchmarks, use the `scripts/benchmark.sh` script.
-
-#### Inference performance benchmark
-
-To benchmark the inference performance on a specific batch size and dataset, run the `inference.py` script.
-### Results
-
-The following sections provide details on how we achieved our performance and accuracy in training and inference.
-
-#### Training accuracy results
-
-We conducted an extensive hyperparameter search along with stability tests. The presented results are averages over hundreds of runs.
-
-##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 GPUs. Accuracy values are the q-risks at the 0.1 / 0.5 / 0.9 quantiles; lower is better.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-------
-| Electricity | 1 | 1024 | 0.027 / 0.059 / 0.029 | 0.028 / 0.058 / 0.029 | 1427s | 1087s | 1.313x
-| Electricity | 8 | 1024 | 0.027 / 0.056 / 0.028 | 0.026 / 0.054 / 0.029 | 216s | 176s | 1.227x
-| Traffic | 1 | 1024 | 0.040 / 0.103 / 0.075 | 0.040 / 0.103 / 0.075 | 957s | 726s | 1.318x
-| Traffic | 8 | 1024 | 0.042 / 0.104 / 0.076 | 0.042 / 0.106 / 0.077 | 151s | 126s | 1.198x
-
-
-
-
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs.
-
-| Dataset | GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision)
-|-------------|---|------|-----------------------|-----------------------|-------|-------|-----------
-| Electricity | 1 | 1024 | 0.027 / 0.056 / 0.028 | 0.027 / 0.058 / 0.029 | 2559s | 1598s | 1.601x
-| Electricity | 8 | 1024 | 0.027 / 0.055 / 0.028 | 0.027 / 0.055 / 0.029 | 381s | 261s | 1.460x
-| Traffic | 1 | 1024 | 0.040 / 0.102 / 0.075 | 0.041 / 0.101 / 0.074 | 1718s | 1062s | 1.618x
-| Traffic | 8 | 1024 | 0.042 / 0.106 / 0.076 | 0.042 / 0.105 / 0.077 | 256s | 176s | 1.455x
-
-
-
-##### Training stability test
-
-In order to get a fuller picture of the model’s accuracy, we performed a hyperparameter search along with stability tests on 100 random seeds for each configuration. Then, for each benchmark dataset, we chose the architecture with the lowest mean test q-risk. The table below summarizes the best configurations.
-
-| Dataset | #GPU | Hidden size | #Heads | Local BS | LR | Gradient clipping | Dropout | Mean q-risk | Std q-risk | Min q-risk | Max q-risk
-|-------------|------|-------------|--------|----------|------|-------------------|---------|-------------|------------|------------|------
-| Electricity | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.1 | 0.1131 | 0.0025 | 0.1080 | 0.1200
-| Traffic | 8 | 128 | 4 | 1024 | 1e-3 | 0.0 | 0.3 | 0.2180 | 0.0049 | 0.2069 | 0.2336
-
-
-#### Training performance results
-
-##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA A100 (A100 80GB) GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 to mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
-|-------------|---|------|--------|--------|-------|-------|-----
-| Electricity | 1 | 1024 | 10173 | 13703 | 1.35x | 1 | 1
-| Electricity | 8 | 1024 | 80596 | 107761 | 1.34x | 7.92x | 7.86x
-| Traffic | 1 | 1024 | 10197 | 13779 | 1.35x | 1 | 1
-| Traffic | 8 | 1024 | 80692 | 107979 | 1.34x | 7.91x | 7.84x
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-
-##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
-
-Our results were obtained by running the `train.sh` training script in the [PyTorch 21.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) on NVIDIA DGX-1 with V100 16GB GPUs. Performance numbers (in items per second) were averaged over an entire training epoch.
-
-| Dataset | GPUs | Batch size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|-------------|---|------|-------|-------|-------|------|----
-| Electricity | 1 | 1024 | 5580 | 9148 | 1.64x | 1 | 1
-| Electricity | 8 | 1024 | 43351 | 69855 | 1.61x | 7.77x | 7.64x
-| Traffic | 1 | 1024 | 5593 | 9194 | 1.64x | 1 | 1
-| Traffic | 8 | 1024 | 43426 | 69983 | 1.61x | 7.76x | 7.61x
-
-
-
-To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
-
-The performance metric used was items per second.
-
-## Release notes
-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to https://developer.nvidia.com/deep-learning-performance-training-inference.
-
-### Changelog
-
-October 2021
-- Initial release
-
-### Known issues
-There are no known issues with this model.
-
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG
deleted file mode 100644
index c3431031..00000000
Binary files a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/TFT_architecture.PNG and /dev/null differ
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py
deleted file mode 100644
index bef26e66..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/configuration.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
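-# configuration.py groups the per-dataset settings used across this model:
-# the feature layout consumed by data_utils.py, the train/valid/test split
-# boundaries, and the model hyperparameters (hidden size, attention heads,
-# dropout, quantiles).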
- -from data_utils import InputTypes, DataTypes, FeatureSpec -import datetime - -class ElectricityConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'days_from_start' # This column contains time indices across which we split the data - self.train_range = (1096, 1315) - self.valid_range = (1308, 1339) - self.test_range = (1332, 1346) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = True - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [369] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.1 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -class TrafficConfig(): - def __init__(self): - - self.features = [ - FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL), - FeatureSpec('hours_from_start', InputTypes.TIME, DataTypes.CONTINUOUS), - FeatureSpec('values', InputTypes.TARGET, DataTypes.CONTINUOUS), - FeatureSpec('time_on_day', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('day_of_week', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('hours_from_start', InputTypes.KNOWN, DataTypes.CONTINUOUS), - FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL), - ] - # Dataset split boundaries - self.time_ids = 'sensor_day' # This column contains time indices across which we split the data - self.train_range = (0, 151) - self.valid_range = (144, 166) - self.test_range = (159, float('inf')) - self.dataset_stride = 1 #how many timesteps between examples - self.scale_per_id = False - self.missing_id_strategy = None - self.missing_cat_data_strategy='encode_all' - - # Feature sizes - self.static_categorical_inp_lens = [963] - self.temporal_known_categorical_inp_lens = [] - self.temporal_observed_categorical_inp_lens = [] - 
self.quantiles = [0.1, 0.5, 0.9] - - self.example_length = 8 * 24 - self.encoder_length = 7 * 24 - - self.n_head = 4 - self.hidden_size = 128 - self.dropout = 0.3 - self.attn_dropout = 0.0 - - #### Derived variables #### - self.temporal_known_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.KNOWN and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_observed_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.OBSERVED and x.feature_embed_type == DataTypes.CONTINUOUS]) - self.temporal_target_size = len([x for x in self.features if x.feature_type == InputTypes.TARGET]) - self.static_continuous_inp_size = len([x for x in self.features - if x.feature_type == InputTypes.STATIC and x.feature_embed_type == DataTypes.CONTINUOUS]) - - self.num_static_vars = self.static_continuous_inp_size + len(self.static_categorical_inp_lens) - self.num_future_vars = self.temporal_known_continuous_inp_size + len(self.temporal_known_categorical_inp_lens) - self.num_historic_vars = sum([self.num_future_vars, - self.temporal_observed_continuous_inp_size, - self.temporal_target_size, - len(self.temporal_observed_categorical_inp_lens), - ]) - - -CONFIGS = {'electricity': ElectricityConfig, - 'traffic': TrafficConfig, - } diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py deleted file mode 100644 index 5c9df6ae..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/criterions.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -class QuantileLoss(nn.Module): - def __init__(self, config): - super().__init__() - self.register_buffer('q', torch.tensor(config.quantiles)) - - def forward(self, predictions, targets): - diff = predictions - targets - ql = (1-self.q)*F.relu(diff) + self.q*F.relu(-diff) - losses = ql.view(-1, ql.shape[-1]).mean(0) - return losses diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py deleted file mode 100644 index f38f8bfb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/data_utils.py +++ /dev/null @@ -1,790 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -################################ -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import pickle -import enum -import datetime - -from collections import namedtuple, OrderedDict - -import sklearn.preprocessing -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np -from bisect import bisect - -import torch -from torch.utils.data import Dataset,IterableDataset,DataLoader - -class DataTypes(enum.IntEnum): - """Defines numerical types of each column.""" - CONTINUOUS = 0 - CATEGORICAL = 1 - DATE = 2 - STR = 3 - -class InputTypes(enum.IntEnum): - """Defines input types of each column.""" - TARGET = 0 - OBSERVED = 1 - KNOWN = 2 - STATIC = 3 - ID = 4 # Single column used as an entity identifier - TIME = 5 # Single column exclusively used as a time index - -FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type']) -DTYPE_MAP = { - DataTypes.CONTINUOUS : np.float32, - DataTypes.CATEGORICAL : np.int64, - DataTypes.DATE:'datetime64[ns]', - DataTypes.STR: str - } - -FEAT_ORDER = [ - (InputTypes.STATIC, DataTypes.CATEGORICAL), - (InputTypes.STATIC, DataTypes.CONTINUOUS), - (InputTypes.KNOWN, DataTypes.CATEGORICAL), - (InputTypes.KNOWN, DataTypes.CONTINUOUS), - (InputTypes.OBSERVED, DataTypes.CATEGORICAL), - (InputTypes.OBSERVED, DataTypes.CONTINUOUS), - (InputTypes.TARGET, DataTypes.CONTINUOUS), - (InputTypes.ID, DataTypes.CATEGORICAL) - ] - -FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id'] -DEFAULT_ID_COL = 'id' - -class TFTBinaryDataset(Dataset): - def __init__(self, path, config): - super(TFTBinaryDataset).__init__() - self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE] - self.example_length = config.example_length - self.stride = config.dataset_stride - - self.grouped = pickle.load(open(path, 'rb')) - self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length] - self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped]) - - - self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER] - - # The list comprehension below is an elaborate way of rearranging data into correct order, - # simultaneously doing casting to proper types. 
Probably can be written neater - self.grouped = [ - [ - arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]]) - for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map) - ] - for arr in self.grouped - ] - - def __len__(self): - return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0 - - def __getitem__(self, idx): - g_idx = bisect(self._cum_examples_in_group, idx) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx] - - tensors = [ - torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length]) - if feat.size else torch.empty(0) - for feat in group - ] - - return OrderedDict(zip(FEAT_NAMES, tensors)) - - -class TFTDataset(Dataset): - def __init__(self, path, config): - super(TFTDataset).__init__() - self.features = config.features - self.data = pd.read_csv(path, index_col=0) - self.example_length = config.example_length - self.stride = config.dataset_stride - - # name field is a column name. - # there can be multiple entries with the same name because one column can be interpreted in many ways - time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME) - id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID) - if not id_col_name in self.data.columns: - id_col_name = DEFAULT_ID_COL - self.features = [x for x in self.features if x.feature_type!=InputTypes.ID] - self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL)) - col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features} - - - self.data.sort_values(time_col_name,inplace=True) - self.data = self.data[set(x.name for x in self.features)] #leave only relevant columns - self.data = self.data.astype(col_dtypes) - self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length) - self.grouped = list(self.data.groupby(id_col_name)) - - self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped]) - - def __len__(self): - return self._cum_examples_in_group[-1] - - def __getitem__(self, idx): - g_idx = len([x for x in self._cum_examples_in_group if x <= idx]) - e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx - - group = self.grouped[g_idx][1] - sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length] - - # We need to be sure that tensors are returned in the correct order - tensors = tuple([] for _ in range(8)) - for v in self.features: - if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL: - tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS: - tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy())) - elif v.feature_type == 
InputTypes.TARGET:
-                tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy()))
-            elif v.feature_type == InputTypes.ID:
-                tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy()))
-
-
-        tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors]
-
-        return OrderedDict(zip(FEAT_NAMES, tensors))
-
-def get_dataset_splits(df, config):
-
-    if hasattr(config, 'relative_split') and config.relative_split:
-        forecast_len = config.example_length - config.encoder_length
-        # The valid split is shifted from the train split by the number of forecast steps into the future.
-        # The test split is shifted by the number of forecast steps from the valid split
-        train = []
-        valid = []
-        test = []
-
-        for _, group in df.groupby(DEFAULT_ID_COL):
-            index = group[config.time_ids]
-            _train = group.loc[index < config.valid_boundary]
-            _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)]
-            _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)]
-            train.append(_train)
-            valid.append(_valid)
-            test.append(_test)
-
-        train = pd.concat(train, axis=0)
-        valid = pd.concat(valid, axis=0)
-        test = pd.concat(test, axis=0)
-    else:
-        index = df[config.time_ids]
-        train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])]
-        valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])]
-        test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])]
-
-    return train, valid, test
-
-def flatten_ids(df, config):
-
-    if config.missing_id_strategy == 'drop':
-        if hasattr(config, 'combine_ids') and config.combine_ids:
-            index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids])
-        else:
-            id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-            index = df[id_col].isna()
-        index = index[index == True].index # Extract indices of nans
-        df.drop(index, inplace=True)
-
-    if not (hasattr(config, 'combine_ids') and config.combine_ids):
-        id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
-        ids = df[id_col].apply(str)
-        df.drop(id_col, axis=1, inplace=True)
-        encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values)
-        df[DEFAULT_ID_COL] = encoder.transform(ids)
-        encoders = OrderedDict({id_col: encoder})
-
-    else:
-        encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids}
-        encoders = OrderedDict(encoders)
-        lens = [len(v.classes_) for v in encoders.values()]
-        clens = np.roll(np.cumprod(lens), 1)
-        clens[0] = 1
-
-        # This takes a very long time. It would probably be better to create 2 dummy columns
-        df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1)
-        df.drop(config.combine_ids, axis=1, inplace=True)
-
-    return DEFAULT_ID_COL, encoders
-
-def impute(df, config):
-    #XXX This ensures that our scaling will have the same mean. We still need to check the variance
-    if not hasattr(config, 'missing_data_label'):
-        return df, None
-    else:
-        imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean')
-        mask = df.applymap(lambda x: True if x == config.missing_data_label else False)
-        data = df.values
-        col_mask = (data == config.missing_data_label).all(axis=0)
-        data[:,~col_mask] = imp.fit_transform(data)
-        return data, mask
-
-def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL):
-    tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET]
-    real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols)))
-    real_scalers = {}
-    tgt_scalers = {}
-
-    def apply_scalers(df, name=None):
-        if name is None:
-            name = df.name
-        mask = df.applymap(lambda x: True if x == config.missing_data_label else False) if hasattr(config, 'missing_data_label') else None
-        df[real_cols] = real_scalers[name].transform(df[real_cols])
-        if mask is not None and mask.values.any():
-            # DataFrame.mask returns a new frame, so assign the result back
-            df[real_cols] = df[real_cols].mask(mask, 10**9)
-        df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols])
-        return df
-
-    if config.scale_per_id:
-        for identifier, sliced in train.groupby(id_col):
-            data = sliced[real_cols]
-            data, _ = impute(data, config)
-            real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
-            # XXX We should probably remove examples that contain NaN as a target
-            target = sliced[tgt_cols]
-            tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target)
-
-        train = train.groupby(id_col).apply(apply_scalers)
-        # For valid and testing leave only timeseries previously present in the train subset
-        # XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away
-        valid = valid.loc[valid[id_col].isin(real_scalers.keys())]
-        valid = valid.groupby(id_col).apply(apply_scalers)
-        test = test.loc[test[id_col].isin(real_scalers.keys())]
-        test = test.groupby(id_col).apply(apply_scalers)
-
-    else:
-        data, _ = impute(train[real_cols], config)
-        real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data)
-        tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols])
-
-        train = apply_scalers(train, name='')
-        valid = apply_scalers(valid, name='')
-        test = apply_scalers(test, name='')
-
-    return train, valid, test, real_scalers, tgt_scalers
-
-def encode_categoricals(train, valid, test, config):
-    cat_encodings = {}
-    cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID))
-    num_classes = [] #XXX Maybe we should modify config based on this value? Or send a warning?
-    # For TC performance reasons we might want num_classes[i] to be divisible by 8
-
-    # Train categorical encoders
-    for c in cat_cols:
-        if config.missing_cat_data_strategy == 'special_token':
-            #XXX this will probably require some data augmentation
-            unique = train[c].unique()
-            # Map categories unseen during training to the special token
-            valid[c].loc[~valid[c].isin(unique)] = ''
-            test[c].loc[~test[c].isin(unique)] = ''
-
-        if config.missing_cat_data_strategy == 'encode_all' or \
-           config.missing_cat_data_strategy == 'special_token':
-            srs = pd.concat([train[c], valid[c], test[c]]).apply(str)
-            cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
-        elif config.missing_cat_data_strategy == 'drop':
In addition to dropping rows, this has to split the affected time series into chunks
-            # to prevent temporal gaps in the data
-            pass
-    print('Categorical variable encoding lengths: ', num_classes)
-
-
-    for split in [train, valid, test]:
-        for c in cat_cols:
-            # Cast to str to match how the encoders were fit, then replace with integer codes
-            split[c] = cat_encodings[c].transform(split[c].apply(str))
-
-    return cat_encodings
-
-
-def preprocess(src_path, dst_path, config):
-    df = pd.read_csv(src_path, index_col=0)
-
-    for c in config.features:
-        if c.feature_embed_type == DataTypes.DATE:
-            df[c.name] = pd.to_datetime(df[c.name])
-
-    # Leave only columns relevant to preprocessing
-    relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
-    df = df[relevant_columns]
-
-
-    id_col, id_encoders = flatten_ids(df, config)
-    df = df.reindex(sorted(df.columns), axis=1)
-
-    train, valid, test = get_dataset_splits(df, config)
-
-    # Length-filter the data (all timeseries shorter than the example length are dropped)
-    #for df in [train, valid, test]:
-    #    df.groupby(id_col).filter(lambda x: len(x) >= config.example_length)
-    train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
-    valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
-    test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])
-
-    train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)
-
-    cat_encodings = encode_categoricals(train, valid, test, config)
-
-    os.makedirs(dst_path, exist_ok=True)
-
-    train.to_csv(os.path.join(dst_path, 'train.csv'))
-    valid.to_csv(os.path.join(dst_path, 'valid.csv'))
-    test.to_csv(os.path.join(dst_path, 'test.csv'))
-
-    # Save relevant columns in binary form for faster dataloading
-    # IMPORTANT: We always expect the id to be a single column identifying the complete timeseries.
-    # We also expect a copy of the id in the form of a static categorical input!
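Editorial note on the binary dump below: `.astype(np.float32).view(dtype=np.int32)` is a zero-copy bit reinterpretation, not a numeric conversion, which is what lets real-valued and categorical columns share a single int32 array and still round-trip exactly on the loader side. A minimal sketch of that round trip (illustrative values, not part of the original file):

    import numpy as np

    values = np.array([[1.5, 7.0], [0.25, 3.0]], dtype=np.float32)
    packed = values.view(np.int32)      # reinterpret the bits; nothing is rounded
    restored = packed.view(np.float32)  # the loader-side inverse view
    assert np.array_equal(restored, values)  # exact round trip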
- col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID] - grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)] - grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)] - grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)] - - pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb')) - pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb')) - pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb')) - - - with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f: - pickle.dump(real_scalers, f) - with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f: - pickle.dump(tgt_scalers, f) - with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f: - pickle.dump(cat_encodings, f) - with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f: - pickle.dump(id_encoders, f) - - -def sample_data(dataset, num_samples): - if num_samples < 0: - return dataset - else: - return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False)) - - -def standarize_electricity(path): - """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py""" - df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
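A quick check of the `t` computation in the block that follows: for a timedelta, `.seconds` holds only the within-day remainder, so the code adds `days * 24` to obtain total hours since `earliest_time`. A toy example (invented values):

    import pandas as pd

    earliest_time = pd.Timestamp('2014-01-01 00:00')
    date = pd.DatetimeIndex(['2014-01-02 05:00'])
    delta = date - earliest_time
    t = delta.seconds / 60 / 60 + delta.days * 24  # -> [29.0] hours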
-
-        tmp = pd.DataFrame({'power_usage': srs})
-        date = tmp.index
-        tmp['t'] = (date - earliest_time).seconds / 60 / 60 + (
-            date - earliest_time).days * 24
-        tmp['days_from_start'] = (date - earliest_time).days
-        tmp['categorical_id'] = label
-        tmp['date'] = date
-        tmp['id'] = label
-        tmp['hour'] = date.hour
-        tmp['day'] = date.day
-        tmp['day_of_week'] = date.dayofweek
-        tmp['month'] = date.month
-
-        df_list.append(tmp)
-
-    output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)
-
-    output['categorical_id'] = output['id'].copy()
-    output['hours_from_start'] = output['t']
-    output['categorical_day_of_week'] = output['day_of_week'].copy()
-    output['categorical_hour'] = output['hour'].copy()
-
-    output.to_csv(os.path.join(path, 'standarized.csv'))
-
-def standarize_volatility(path):
-    df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0)  # no explicit index
-
-    # Adds additional date/day fields
-    idx = [str(s).split('+')[0] for s in df.index]  # ignore timezones, we don't need them
-    dates = pd.to_datetime(idx)
-    df['date'] = dates
-    df['days_from_start'] = (dates - pd.Timestamp(2000, 1, 3)).days  # pd.datetime was removed in pandas 1.0
-    df['day_of_week'] = dates.dayofweek
-    df['day_of_month'] = dates.day
-    df['week_of_year'] = dates.weekofyear
-    df['month'] = dates.month
-    df['year'] = dates.year
-    df['categorical_id'] = df['Symbol'].copy()
-
-    # Processes log volatility
-    vol = df['rv5_ss'].copy()
-    vol.loc[vol == 0.] = np.nan
-    df['log_vol'] = np.log(vol)
-
-    # Adds static information
-    symbol_region_mapping = {
-        '.AEX': 'EMEA',
-        '.AORD': 'APAC',
-        '.BFX': 'EMEA',
-        '.BSESN': 'APAC',
-        '.BVLG': 'EMEA',
-        '.BVSP': 'AMER',
-        '.DJI': 'AMER',
-        '.FCHI': 'EMEA',
-        '.FTMIB': 'EMEA',
-        '.FTSE': 'EMEA',
-        '.GDAXI': 'EMEA',
-        '.GSPTSE': 'AMER',
-        '.HSI': 'APAC',
-        '.IBEX': 'EMEA',
-        '.IXIC': 'AMER',
-        '.KS11': 'APAC',
-        '.KSE': 'APAC',
-        '.MXX': 'AMER',
-        '.N225': 'APAC',
-        '.NSEI': 'APAC',
-        '.OMXC20': 'EMEA',
-        '.OMXHPI': 'EMEA',
-        '.OMXSPI': 'EMEA',
-        '.OSEAX': 'EMEA',
-        '.RUT': 'EMEA',
-        '.SMSI': 'EMEA',
-        '.SPX': 'AMER',
-        '.SSEC': 'APAC',
-        '.SSMI': 'EMEA',
-        '.STI': 'APAC',
-        '.STOXX50E': 'EMEA'
-    }
-
-    df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k])
-
-    # Performs final processing
-    output_df_list = []
-    for grp in df.groupby('Symbol'):
-        sliced = grp[1].copy()
-        sliced.sort_values('days_from_start', inplace=True)
-        # Impute log volatility values
-        sliced['log_vol'].fillna(method='ffill', inplace=True)
-        sliced = sliced.dropna()  # dropna returns a copy; assign it back
-        output_df_list.append(sliced)
-
-    df = pd.concat(output_df_list, axis=0)
-
-    df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-def standarize_traffic(path):
-    def process_list(s, variable_type=int, delimiter=None):
-        """Parses a line in the PEMS format to a list."""
-        if delimiter is None:
-            l = [
-                variable_type(i) for i in s.replace('[', '').replace(']', '').split()
-            ]
-        else:
-            l = [
-                variable_type(i)
-                for i in s.replace('[', '').replace(']', '').split(delimiter)
-            ]
-
-        return l
-
-    def read_single_list(filename):
-        """Returns a single list from a file in the PEMS-custom format."""
-        with open(os.path.join(path, filename), 'r') as dat:
-            l = process_list(dat.readlines()[0])
-        return l
-
-    def read_matrix(filename):
-        """Returns a matrix from a file in the PEMS-custom format."""
-        array_list = []
-        with open(os.path.join(path, filename), 'r') as dat:
-            lines = dat.readlines()
-            for i, line in enumerate(lines):
-                if (i + 1) % 50 == 0:
-                    print('Completed {} of {} rows for {}'.format(i + 1, len(lines),
-                                                                  filename))
-                array = [
-                    process_list(row_split, variable_type=float, delimiter=None)
-                    for row_split in process_list(
-                        line, variable_type=str, delimiter=';')
-                ]
-                array_list.append(array)
-
-        return array_list
-
-    shuffle_order = np.array(read_single_list('randperm')) - 1  # index from 0
-    train_dayofweek = read_single_list('PEMS_trainlabels')
-    train_tensor = read_matrix('PEMS_train')
-    test_dayofweek = read_single_list('PEMS_testlabels')
-    test_tensor = read_matrix('PEMS_test')
-
-    # Invert the shuffle-order permutation
-    print('Shuffling')
-    inverse_mapping = {
-        new_location: previous_location
-        for previous_location, new_location in enumerate(shuffle_order)
-    }
-    reverse_shuffle_order = np.array([
-        inverse_mapping[new_location]
-        for new_location, _ in enumerate(shuffle_order)
-    ])
-
-    # Group and reorder based on the permutation
-    print('Reordering')
-    day_of_week = np.array(train_dayofweek + test_dayofweek)
-    combined_tensor = np.array(train_tensor + test_tensor)
-
-    day_of_week = day_of_week[reverse_shuffle_order]
-    combined_tensor = combined_tensor[reverse_shuffle_order]
-
-    # Put everything back into a dataframe
-    print('Parsing as dataframe')
-    labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]
-
-    hourly_list = []
-    for day, day_matrix in enumerate(combined_tensor):
-        # Hourly data
-        hourly = pd.DataFrame(day_matrix.T, columns=labels)
-        hourly['hour_on_day'] = [int(i / 6) for i in hourly.index]  # sampled at 10 min intervals
-        if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
-            raise ValueError('Invalid hour! {}-{}'.format(
-                hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))
-
-        hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
-        hourly['sensor_day'] = day
-        hourly['time_on_day'] = hourly.index
-        hourly['day_of_week'] = day_of_week[day]
-
-        hourly_list.append(hourly)
-
-    hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)
-
-    # Flatten so that each entity uses one row of the dataframe
-    store_columns = [c for c in hourly_frame.columns if 'traj' in c]
-    other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
-    flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] +
-                           other_columns + ['id'])
-
-    for store in store_columns:
-        print('Processing {}'.format(store))
-
-        sliced = hourly_frame[[store] + other_columns].copy()
-        sliced.columns = ['values'] + other_columns
-        sliced['id'] = int(store.replace('traj_', ''))
-
-        # Sort by sensor-date-time
-        key = sliced['id'].apply(str) \
-            + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
-            + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
-        sliced = sliced.set_index(key).sort_index()
-
-        sliced['values'] = sliced['values'].fillna(method='ffill')
-        sliced['prev_values'] = sliced['values'].shift(1)
-        sliced['next_values'] = sliced['values'].shift(-1)
-
-        flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)
-
-    # Filter to match range used by other academic papers
-    index = flat_df['sensor_day']
-    flat_df = flat_df[index < 173].copy()
-
-    # Create columns for categorical inputs
-    flat_df['categorical_id'] = flat_df['id'].copy()
-    flat_df['hours_from_start'] = flat_df['time_on_day'] \
-        + flat_df['sensor_day']*24.
-    flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
-    flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()
-
-    flat_df.to_csv(os.path.join(path, 'standarized.csv'))
-
-
-# XXX needs rework
-def standarize_favorita(data_folder):
-    import gc
-    # Extract only a subset of data to save/process for efficiency
-    start_date = pd.Timestamp(2015, 1, 1)  # pd.datetime was removed in pandas 1.0
-    end_date = pd.Timestamp(2016, 6, 1)
-
-    print('Regenerating data...')
-
-    # load temporal data
-    temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)
-
-    store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
-    oil = pd.read_csv(
-        os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
-    holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
-    items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
-    transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))
-
-    # Parse dates; the range actually kept is set by start_date/end_date above
-    temporal['date'] = pd.to_datetime(temporal['date'])
-
-    # Filter dates to reduce storage space requirements
-    if start_date is not None:
-        temporal = temporal[(temporal['date'] >= start_date)]
-    if end_date is not None:
-        temporal = temporal[(temporal['date'] < end_date)]
-
-    dates = temporal['date'].unique()
-
-    # Add trajectory identifier
-    temporal['traj_id'] = temporal['store_nbr'].apply(
-        str) + '_' + temporal['item_nbr'].apply(str)
-    temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply(
-        str)
-
-    # Remove all IDs with negative returns
-    print('Removing returns data')
-    min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
-    valid_ids = set(min_returns[min_returns >= 0].index)
-    selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
-    new_temporal = temporal[selector].copy()
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-    temporal['open'] = 1
-
-    # Resampling
-    print('Resampling to regular grid')
-    resampled_dfs = []
-    for traj_id, raw_sub_df in temporal.groupby('traj_id'):
-        print('Resampling', traj_id)
-        sub_df = raw_sub_df.set_index('date', drop=True).copy()
-        sub_df = sub_df.resample('1d').last()
-        sub_df['date'] = sub_df.index
-        sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
-            = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
-        sub_df['open'] = sub_df['open'].fillna(
-            0)  # flag where sales data is unknown
-        sub_df['log_sales'] = np.log(sub_df['unit_sales'])
-
-        resampled_dfs.append(sub_df.reset_index(drop=True))
-
-    new_temporal = pd.concat(resampled_dfs, axis=0)
-    del temporal
-    gc.collect()
-    temporal = new_temporal
-
-    print('Adding oil')
-    oil.name = 'oil'
-    oil.index = pd.to_datetime(oil.index)
-    #XXX The lines below align the oil price on a given date with the rest of the timeseries:
-    # missing values in the oil series are copied from the preceding index, then the oil series
-    # is joined with temporal. Dates present in temporal but absent from the oil series get an
-    # oil value of -1. WHY?!
-    #TODO: check how many NaNs there are after the first step. Previously the oil series was
-    # extended with the dates in the `dates` variable (as NaNs), which were then forward-filled.
-    # This behavior is no longer supported by pandas, so we changed to the DataFrame.isin method.
-    # This leaves us with more NaNs after the first step than before. To recover the previous
-    # behavior we would have to join the series before filling NaNs.
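To make the comment above concrete, a small pandas sketch (toy values, not repository data) of the two behaviors being contrasted:

    import numpy as np
    import pandas as pd

    oil = pd.Series([10.0, np.nan, 12.0], name='oil',
                    index=pd.to_datetime(['2015-01-01', '2015-01-02', '2015-01-04']))
    dates = pd.to_datetime(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04'])

    # Previous behavior: extend the oil series to every date first, then ffill,
    # so 2015-01-03 already has a price before the join.
    old = oil.reindex(oil.index.union(dates)).fillna(method='ffill')

    # Current behavior: keep only dates already in the oil index; 2015-01-03 is
    # absent, so it is only filled after the join (ffill, then the -1 fallback).
    new = oil.loc[oil.index.isin(dates)]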
- temporal = temporal.join( - #oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left') - oil.loc[oil.index.isin(dates)], on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(method='ffill') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - # Transform date to integer index - start_date = pd.to_datetime(min(temporal['date'])) - dates = temporal['date'].apply(pd.to_datetime) - temporal['days_from_start'] = (dates - start_date).dt.days - temporal['categorical_id'] = temporal['traj_id'].copy() - - print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv'))) - temporal.to_csv(os.path.join(data_folder, 'standarized.csv')) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py deleted file mode 100644 index f8f5b331..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/ema.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 NVIDIA CORPORATION - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2019 Ross Wightman - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exponential Moving Average (EMA) of model updates -""" - -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn - -class ModelEma(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - """ - def __init__(self, model, decay=0.999, device=None): - super().__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def update(self, model): - update_fn=lambda ema_v, model_v: self.decay * ema_v + (1. - self.decay) * model_v - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def set(self, model): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_( model_v ) - - def forward(self, x): - return self.module(x) diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py deleted file mode 100644 index 79fb1fc4..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/gpu_affinity.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
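Stepping back to ema.py above: a minimal usage sketch of the `ModelEma` wrapper (the model, loop, and import path are placeholders, not repository code):

    import torch
    import torch.nn as nn
    from ema import ModelEma  # hypothetical import path for the module above

    model = nn.Linear(8, 1)             # stand-in for the real network
    ema = ModelEma(model, decay=0.999)  # shadow copy that tracks the weights

    for _ in range(100):
        # ... forward/backward and optimizer.step() update `model` here ...
        ema.update(model)               # ema_v <- decay*ema_v + (1-decay)*model_v

    with torch.no_grad():
        out = ema(torch.randn(2, 8))    # evaluate with the averaged weights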
- -import collections -import math -import os -import pathlib -import re - -import pynvml - -pynvml.nvmlInit() - - -def systemGetDriverVersion(): - return pynvml.nvmlSystemGetDriverVersion() - - -def deviceGetCount(): - return pynvml.nvmlDeviceGetCount() - - -class device: - # assume nvml returns list of 64 bit ints - _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) - - def __init__(self, device_idx): - super().__init__() - self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) - - def getName(self): - return pynvml.nvmlDeviceGetName(self.handle) - - def getCpuAffinity(self): - affinity_string = '' - for j in pynvml.nvmlDeviceGetCpuAffinity( - self.handle, device._nvml_affinity_elements - ): - # assume nvml returns list of 64 bit ints - affinity_string = '{:064b}'.format(j) + affinity_string - affinity_list = [int(x) for x in affinity_string] - affinity_list.reverse() # so core 0 is in 0th element of list - - ret = [i for i, e in enumerate(affinity_list) if e != 0] - return ret - - -def set_socket_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity) - - -def set_single_affinity(gpu_id): - dev = device(gpu_id) - affinity = dev.getCpuAffinity() - os.sched_setaffinity(0, affinity[:1]) - - -def set_single_unique_affinity(gpu_id, nproc_per_node): - devices = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in devices] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - affinities = [] - assigned = [] - - for socket_affinity in socket_affinities: - for core in socket_affinity: - if core not in assigned: - affinities.append([core]) - assigned.append(core) - break - os.sched_setaffinity(0, affinities[gpu_id]) - - -def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): - device_ids = [device(i) for i in range(nproc_per_node)] - socket_affinities = [dev.getCpuAffinity() for dev in device_ids] - - siblings_list = get_thread_siblings_list() - siblings_dict = dict(siblings_list) - - # remove siblings - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) - - socket_affinities_to_device_ids = collections.defaultdict(list) - - for idx, socket_affinity in enumerate(socket_affinities): - socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) - - for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): - devices_per_group = len(device_ids) - cores_per_device = len(socket_affinity) // devices_per_group - for group_id, device_id in enumerate(device_ids): - if device_id == gpu_id: - if mode == 'interleaved': - affinity = list(socket_affinity[group_id::devices_per_group]) - elif mode == 'continuous': - affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) - else: - raise RuntimeError('Unknown set_socket_unique_affinity mode') - - # reintroduce siblings - affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] - os.sched_setaffinity(0, affinity) - - -def get_thread_siblings_list(): - path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' - thread_siblings_list = [] - pattern = re.compile(r'(\d+)\D(\d+)') - for fname in pathlib.Path(path[0]).glob(path[1:]): - with open(fname) as f: - content = 
f.read().strip() - res = pattern.findall(content) - if res: - pair = tuple(map(int, res[0])) - thread_siblings_list.append(pair) - return thread_siblings_list - - -def set_affinity(gpu_id, nproc_per_node, mode='socket'): - if mode == 'socket': - set_socket_affinity(gpu_id) - elif mode == 'single': - set_single_affinity(gpu_id) - elif mode == 'single_unique': - set_single_unique_affinity(gpu_id, nproc_per_node) - elif mode == 'socket_unique_interleaved': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') - elif mode == 'socket_unique_continuous': - set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') - else: - raise RuntimeError('Unknown affinity mode') - - affinity = os.sched_getaffinity(0) - return affinity - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py deleted file mode 100644 index 056429f1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/inference.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pandas as pd -import numpy as np -import pickle -import argparse -import torch -from torch.utils.data import DataLoader -from torch.cuda import amp -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm -from modeling import TemporalFusionTransformer -from configuration import ElectricityConfig -from data_utils import TFTDataset -from utils import PerformanceMeter -from criterions import QuantileLoss -import dllogger -from log_helper import setup_logger - -def _unscale_per_id(config, values, ids, scalers): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - flat_values['id'] = ids - df_list = [] - for idx, group in flat_values.groupby('id'): - scaler = scalers[idx] - group_copy = group.copy() - for col in group_copy.columns: - if not 'id' in col: - _col = np.expand_dims(group_copy[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - group_copy[col] = _t_col - df_list.append(group_copy) - flat_values = pd.concat(df_list, axis=0) - - flat_values = flat_values[[col for col in flat_values if not 'id' in col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def _unscale(config, values, scaler): - values = values.cpu().numpy() - num_horizons = config.example_length - config.encoder_length + 1 - flat_values = pd.DataFrame( - values, - columns=[f't{j}' for j in range(num_horizons - values.shape[1], num_horizons)] - ) - for col in flat_values.columns: - if not 'id' in col: - _col = np.expand_dims(flat_values[col].values, -1) - _t_col = scaler.inverse_transform(_col)[:,-1] - flat_values[col] = _t_col - - flat_values = flat_values[[col for col in flat_values if not 'id' in 
col]] - flat_tensor = torch.from_numpy(flat_values.values) - return flat_tensor - -def predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=False): - model.eval() - predictions = [] - targets = [] - ids = [] - perf_meter = PerformanceMeter() - n_workers = args.distributed_world_size if hasattr(args, 'distributed_world_size') else 1 - - for step, batch in enumerate(data_loader): - perf_meter.reset_current_lap() - with torch.no_grad(): - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - ids.append(batch['id'][:,0,:]) - targets.append(batch['target']) - predictions.append(model(batch).float()) - - perf_meter.update(args.batch_size * n_workers, - exclude_from_total=step in [0, len(data_loader)-1]) - - targets = torch.cat(targets, dim=0) - if not extend_targets: - targets = targets[:,config.encoder_length:,:] - predictions = torch.cat(predictions, dim=0) - - if config.scale_per_id: - ids = torch.cat(ids, dim=0).cpu().numpy() - - unscaled_predictions = torch.stack( - [_unscale_per_id(config, predictions[:,:,i], ids, scalers) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale_per_id(config, targets[:,:,0], ids, scalers).unsqueeze(-1) - else: - ids = None - unscaled_predictions = torch.stack( - [_unscale(config, predictions[:,:,i], scalers['']) for i in range(len(config.quantiles))], - dim=-1) - unscaled_targets = _unscale(config, targets[:,:,0], scalers['']).unsqueeze(-1) - - return unscaled_predictions, unscaled_targets, ids, perf_meter - -def visualize_v2(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, _ = predict(args, config, model, data_loader, scalers, cat_encodings, extend_targets=True) - - num_horizons = config.example_length - config.encoder_length + 1 - pad = unscaled_predictions.new_full((unscaled_targets.shape[0], unscaled_targets.shape[1] - unscaled_predictions.shape[1], unscaled_predictions.shape[2]), fill_value=float('nan')) - pad[:,-1,:] = unscaled_targets[:,-num_horizons,:] - unscaled_predictions = torch.cat((pad, unscaled_predictions), dim=1) - - ids = torch.from_numpy(ids.squeeze()) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): - for i, ex in enumerate(g): - df = pd.DataFrame(ex.numpy(), - index=range(num_horizons - ex.shape[0], num_horizons), - columns=['target'] + [f'P{int(q*100)}' for q in config.quantiles]) - fig = df.plot().get_figure() - ax = fig.get_axes()[0] - _values = df.values[config.encoder_length-1:,:] - ax.fill_between(range(num_horizons), _values[:,1], _values[:,-1], alpha=0.2, color='green') - os.makedirs(os.path.join(args.results, 'single_example_vis', str(key)), exist_ok=True) - fig.savefig(os.path.join(args.results, 'single_example_vis', str(key), f'{i}.pdf')) - -def inference(args, config, model, data_loader, scalers, cat_encodings): - unscaled_predictions, unscaled_targets, ids, perf_meter = predict(args, config, model, data_loader, scalers, cat_encodings) - - if args.joint_visualization or args.save_predictions: - ids = torch.from_numpy(ids.squeeze()) - #ids = torch.cat([x['id'][0] for x in data_loader.dataset]) - joint_graphs = torch.cat([unscaled_targets, unscaled_predictions], dim=2) - graphs = {i:joint_graphs[ids == i, :, :] for i in set(ids.tolist())} - for key, g in graphs.items(): #timeseries id, joint targets and predictions - _g = {'targets': g[:,:,0]} - 
_g.update({f'P{int(q*100)}':g[:,:,i+1] for i, q in enumerate(config.quantiles)})
-
-            if args.joint_visualization:
-                summary_writer = SummaryWriter(log_dir=os.path.join(args.results, 'predictions_vis', str(key)))
-                for q, t in _g.items(): # targets and quantiles, per-horizon values
-                    if q == 'targets':
-                        targets = torch.cat([t[:,0], t[-1,1:]]) # WIP
-                        # We want to plot targets on the same graph as predictions. Probably could be written better.
-                        for i, val in enumerate(targets):
-                            summary_writer.add_scalars(str(key), {f'{q}':val}, i)
-                        continue
-
-                    # Tensor t contains different time horizons that are shifted in phase;
-                    # the next lines realign them
-                    y = t.new_full((t.shape[0] + t.shape[1] - 1, t.shape[1]), float('nan'))
-                    for i in range(y.shape[1]):
-                        y[i:i+t.shape[0], i] = t[:,i]
-
-                    for i, vals in enumerate(y): # timestep, per-horizon values
-                        summary_writer.add_scalars(str(key), {f'{q}_t+{j+1}':v for j,v in enumerate(vals) if v == v}, i)
-                summary_writer.close()
-
-            if args.save_predictions:
-                for q, t in _g.items():
-                    df = pd.DataFrame(t.tolist())
-                    df.columns = [f't+{i+1}' for i in range(len(df.columns))]
-                    os.makedirs(os.path.join(args.results, 'predictions', str(key)), exist_ok=True)
-                    df.to_csv(os.path.join(args.results, 'predictions', str(key), q+'.csv'))
-
-    losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
-    normalizer = unscaled_targets.abs().mean()
-    q_risk = 2 * losses / normalizer
-
-    perf_dict = {
-        'throughput': perf_meter.avg,
-        'latency_avg': perf_meter.total_time/len(perf_meter.intervals),
-        'latency_p90': perf_meter.p(90),
-        'latency_p95': perf_meter.p(95),
-        'latency_p99': perf_meter.p(99),
-        'total_inference_time': perf_meter.total_time,
-    }
-
-    return q_risk, perf_dict
-
-
-def main(args):
-
-    setup_logger(args)
-    # Set up model
-    state_dict = torch.load(args.checkpoint)
-    config = state_dict['config']
-    model = TemporalFusionTransformer(config).cuda()
-    model.load_state_dict(state_dict['model'])
-    model.eval()
-    model.cuda()
-
-    # Set up dataset
-    test_split = TFTDataset(args.data, config)
-    data_loader = DataLoader(test_split, batch_size=args.batch_size, num_workers=4)
-
-    scalers = pickle.load(open(args.tgt_scalers, 'rb'))
-    cat_encodings = pickle.load(open(args.cat_encodings, 'rb'))
-
-    if args.visualize:
-        # TODO: abstract away all forms of visualization.
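An aside on the q-risk computed in `inference` above: it is the normalized quantile loss from the TFT paper. Schematically (the actual code delegates the numerator to `QuantileLoss(config)`; whether sums or means are used cancels in the ratio):

    import torch

    def q_risk(y, y_hat, q):
        # Pinball loss, normalized by the mean absolute target and doubled,
        # matching `2 * losses / normalizer` in the code above.
        diff = y - y_hat
        quantile_loss = torch.max(q * diff, (q - 1) * diff).mean()
        return 2 * quantile_loss / y.abs().mean()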
-        visualize_v2(args, config, model, data_loader, scalers, cat_encodings)
-
-    quantiles, perf_dict = inference(args, config, model, data_loader, scalers, cat_encodings)
-    quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()}
-    finish_log = {**quantiles, **perf_dict}
-    dllogger.log(step=(), data=finish_log, verbosity=1)
-    # Index the dict explicitly; unpacking it with * would yield the keys, not the values
-    print('Test q-risk: P10 {} | P50 {} | P90 {}'.format(
-        quantiles['test_p10'], quantiles['test_p50'], quantiles['test_p90']))
-    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
-        perf_dict['latency_avg'], perf_dict['latency_p90'], perf_dict['latency_p95'], perf_dict['latency_p99']))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--checkpoint', type=str,
-                        help='Path to the checkpoint')
-    parser.add_argument('--data', type=str,
-                        help='Path to the test split of the dataset')
-    parser.add_argument('--tgt_scalers', type=str,
-                        help='Path to the tgt_scalers.bin file produced by the preprocessing')
-    parser.add_argument('--cat_encodings', type=str,
-                        help='Path to the cat_encodings.bin file produced by the preprocessing')
-    parser.add_argument('--batch_size', type=int, default=64)
-    parser.add_argument('--visualize', action='store_true', help='Visualize predictions - each example on a separate plot')
-    parser.add_argument('--joint_visualization', action='store_true', help='Visualize predictions - each timeseries on a separate plot. Projections will be concatenated.')
-    parser.add_argument('--save_predictions', action='store_true')
-    parser.add_argument('--results', type=str, default='/results')
-    parser.add_argument('--log_file', type=str, default='dllogger.json')
-    ARGS = parser.parse_args()
-    main(ARGS)
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py
deleted file mode 100644
index 83d2ac7f..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/log_helper.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import subprocess
-import sys
-import itertools
-import atexit
-
-import dllogger
-from dllogger import Backend, JSONStreamBackend, StdOutBackend
-
-import torch.distributed as dist
-from torch.utils.tensorboard import SummaryWriter
-
-class TensorBoardBackend(Backend):
-    def __init__(self, verbosity, log_dir):
-        super().__init__(verbosity=verbosity)
-        self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'),
-                                            flush_secs=120,
-                                            max_queue=200
-                                            )
-        self.hp_cache = None
-        atexit.register(self.summary_writer.close)
-
-    @property
-    def log_level(self):
-        return self._log_level
-
-    def metadata(self, timestamp, elapsedtime, metric, metadata):
-        pass
-
-    def log(self, timestamp, elapsedtime, step, data):
-        if step == 'HPARAMS':
-            parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
-            # Unpack lists and tuples
-            for d in [{k+f'_{i}': v for i, v in enumerate(l)} for k, l in data.items() if isinstance(l, (list, tuple))]:
-                parameters.update(d)
-            # Remove custom classes (filter the already-unpacked parameters, not the raw data)
-            parameters = {k: v for k, v in parameters.items() if isinstance(v, (int, float, str, bool))}
-            parameters.update({k: 'None' for k, v in data.items() if v is None})
-            self.hp_cache = parameters
-        if step == ():
-            if self.hp_cache is None:
-                print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr)
-                return
-            self.summary_writer.add_hparams(self.hp_cache, data)
-        if not isinstance(step, int):
-            return
-        for k, v in data.items():
-            self.summary_writer.add_scalar(k, v, step)
-
-    def flush(self):
-        pass
-
-def setup_logger(args):
-    os.makedirs(args.results, exist_ok=True)
-    log_path = os.path.join(args.results, args.log_file)
-
-    if os.path.exists(log_path):
-        for i in itertools.count():
-            s_fname = args.log_file.split('.')
-            fname = '.'.join(s_fname[:-1]) + f'_{i}.'
+ s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
-            log_path = os.path.join(args.results, fname)
-            if not os.path.exists(log_path):
-                break
-
-    def metric_format(metric, metadata, value):
-        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
-    def step_format(step):
-        if step == ():
-            return "Finished |"
-        elif isinstance(step, int):
-            return "Step {0: <5} |".format(step)
-        return "Step {} |".format(step)
-
-
-    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
-        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
-                                TensorBoardBackend(verbosity=1, log_dir=args.results),
-                                StdOutBackend(verbosity=2,
-                                              step_format=step_format,
-                                              prefix_format=lambda x: "")#,
-                                              #metric_format=metric_format)
-                                ])
-    else:
-        dllogger.init(backends=[])
-    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
-
-    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
-    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
-
-    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
-    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
-    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'})
-    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
-    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
-    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
-    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
-
-
-def get_framework_env_vars():
-    return {
-        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
-        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
-        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
-        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
-        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
-        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
-        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
-        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
-        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
-        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
-
-def get_system_info():
-    system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
-    system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
-    system_info = [x for x in system_info if x]
-    return {'system_info': system_info}
diff --git
a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py deleted file mode 100644 index 65e64983..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/modeling.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch import Tensor -from typing import Dict, Tuple, Optional, List - -if os.environ.get("TFT_SCRIPTING", False): - from torch.nn import LayerNorm -else: - from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -class MaybeLayerNorm(nn.Module): - def __init__(self, output_size, hidden_size, eps): - super().__init__() - if output_size and output_size == 1: - self.ln = nn.Identity() - else: - self.ln = LayerNorm(output_size if output_size else hidden_size, eps=eps) - - def forward(self, x): - return self.ln(x) - - -class GLU(nn.Module): - def __init__(self, hidden_size, output_size): - super().__init__() - self.lin = nn.Linear(hidden_size, output_size * 2) - - def forward(self, x: Tensor) -> Tensor: - x = self.lin(x) - x = F.glu(x) - return x - - -class GRN(nn.Module): - def __init__(self, - input_size, - hidden_size, - output_size=None, - context_hidden_size=None, - dropout=0): - super().__init__() - - - self.layer_norm = MaybeLayerNorm(output_size, hidden_size, eps=1e-3) - self.lin_a = nn.Linear(input_size, hidden_size) - if context_hidden_size is not None: - self.lin_c = nn.Linear(context_hidden_size, hidden_size, bias=False) - self.lin_i = nn.Linear(hidden_size, hidden_size) - self.glu = GLU(hidden_size, output_size if output_size else hidden_size) - self.dropout = nn.Dropout(dropout) - self.out_proj = nn.Linear(input_size, output_size) if output_size else None - - def forward(self, a: Tensor, c: Optional[Tensor] = None): - x = self.lin_a(a) - if c is not None: - x = x + self.lin_c(c).unsqueeze(1) - x = F.elu(x) - x = self.lin_i(x) - x = self.dropout(x) - x = self.glu(x) - y = a if not self.out_proj else self.out_proj(a) - x = x + y - x = self.layer_norm(x) - return x - -class TFTEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.s_cat_inp_lens = config.static_categorical_inp_lens - self.t_cat_k_inp_lens = config.temporal_known_categorical_inp_lens - self.t_cat_o_inp_lens = config.temporal_observed_categorical_inp_lens - self.s_cont_inp_size = config.static_continuous_inp_size - self.t_cont_k_inp_size = config.temporal_known_continuous_inp_size - self.t_cont_o_inp_size = config.temporal_observed_continuous_inp_size - self.t_tgt_size = config.temporal_target_size - - self.hidden_size = config.hidden_size - - # There are 7 types of input: - # 1. Static categorical - # 2. Static continuous - # 3. Temporal known a priori categorical - # 4. Temporal known a priori continuous - # 5. Temporal observed categorical - # 6. 
Temporal observed continuous - # 7. Temporal observed targets (time series obseved so far) - - self.s_cat_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.s_cat_inp_lens]) if self.s_cat_inp_lens else None - self.t_cat_k_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_k_inp_lens]) if self.t_cat_k_inp_lens else None - self.t_cat_o_embed = nn.ModuleList([ - nn.Embedding(n, self.hidden_size) for n in self.t_cat_o_inp_lens]) if self.t_cat_o_inp_lens else None - - self.s_cont_embedding_vectors = nn.Parameter(torch.Tensor(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_vectors = nn.Parameter(torch.Tensor(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_vectors = nn.Parameter(torch.Tensor(self.t_tgt_size, self.hidden_size)) - - self.s_cont_embedding_bias = nn.Parameter(torch.zeros(self.s_cont_inp_size, self.hidden_size)) if self.s_cont_inp_size else None - self.t_cont_k_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_k_inp_size, self.hidden_size)) if self.t_cont_k_inp_size else None - self.t_cont_o_embedding_bias = nn.Parameter(torch.zeros(self.t_cont_o_inp_size, self.hidden_size)) if self.t_cont_o_inp_size else None - self.t_tgt_embedding_bias = nn.Parameter(torch.zeros(self.t_tgt_size, self.hidden_size)) - - if self.s_cont_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.s_cont_embedding_vectors) - if self.t_cont_k_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_k_embedding_vectors) - if self.t_cont_o_embedding_vectors is not None: - torch.nn.init.xavier_normal_(self.t_cont_o_embedding_vectors) - torch.nn.init.xavier_normal_(self.t_tgt_embedding_vectors) - - def _apply_embedding(self, - cat: Optional[Tensor], - cont: Optional[Tensor], - cat_emb: Optional[nn.ModuleList], - cont_emb: Tensor, - cont_bias: Tensor, - ) -> Tuple[Optional[Tensor], Optional[Tensor]]: - e_cat = torch.stack([embed(cat[...,i]) for i, embed in enumerate(cat_emb)], dim=-2) if cat is not None else None - if cont is not None: - #the line below is equivalent to following einsums - #e_cont = torch.einsum('btf,fh->bthf', cont, cont_emb) - #e_cont = torch.einsum('bf,fh->bhf', cont, cont_emb) - e_cont = torch.mul(cont.unsqueeze(-1), cont_emb) - e_cont = e_cont + cont_bias - else: - e_cont = None - - if e_cat is not None and e_cont is not None: - return torch.cat([e_cat, e_cont], dim=-2) - elif e_cat is not None: - return e_cat - elif e_cont is not None: - return e_cont - else: - return None - - def forward(self, x: Dict[str, Tensor]): - # temporal/static categorical/continuous known/observed input - s_cat_inp = x.get('s_cat', None) - s_cont_inp = x.get('s_cont', None) - t_cat_k_inp = x.get('k_cat', None) - t_cont_k_inp = x.get('k_cont', None) - t_cat_o_inp = x.get('o_cat', None) - t_cont_o_inp = x.get('o_cont', None) - t_tgt_obs = x['target'] # Has to be present - - # Static inputs are expected to be equal for all timesteps - # For memory efficiency there is no assert statement - s_cat_inp = s_cat_inp[:,0,:] if s_cat_inp is not None else None - s_cont_inp = s_cont_inp[:,0,:] if s_cont_inp is not None else None - - s_inp = self._apply_embedding(s_cat_inp, - s_cont_inp, - self.s_cat_embed, - self.s_cont_embedding_vectors, - self.s_cont_embedding_bias) - t_known_inp = 
self._apply_embedding(t_cat_k_inp, - t_cont_k_inp, - self.t_cat_k_embed, - self.t_cont_k_embedding_vectors, - self.t_cont_k_embedding_bias) - t_observed_inp = self._apply_embedding(t_cat_o_inp, - t_cont_o_inp, - self.t_cat_o_embed, - self.t_cont_o_embedding_vectors, - self.t_cont_o_embedding_bias) - - # Temporal observed targets - # t_observed_tgt = torch.einsum('btf,fh->btfh', t_tgt_obs, self.t_tgt_embedding_vectors) - t_observed_tgt = torch.matmul(t_tgt_obs.unsqueeze(3).unsqueeze(4), self.t_tgt_embedding_vectors.unsqueeze(1)).squeeze(3) - t_observed_tgt = t_observed_tgt + self.t_tgt_embedding_bias - - return s_inp, t_known_inp, t_observed_inp, t_observed_tgt - -class VariableSelectionNetwork(nn.Module): - def __init__(self, config, num_inputs): - super().__init__() - self.joint_grn = GRN(config.hidden_size*num_inputs, config.hidden_size, output_size=num_inputs, context_hidden_size=config.hidden_size) - self.var_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(num_inputs)]) - - def forward(self, x: Tensor, context: Optional[Tensor] = None): - Xi = x.reshape(*x.shape[:-2], -1) - grn_outputs = self.joint_grn(Xi, c=context) - sparse_weights = F.softmax(grn_outputs, dim=-1) - transformed_embed_list = [m(x[...,i,:]) for i, m in enumerate(self.var_grns)] - transformed_embed = torch.stack(transformed_embed_list, dim=-1) - #the line below performs batched matrix vector multiplication - #for temporal features it's bthf,btf->bth - #for static features it's bhf,bf->bh - variable_ctx = torch.matmul(transformed_embed, sparse_weights.unsqueeze(-1)).squeeze(-1) - - return variable_ctx, sparse_weights - -class StaticCovariateEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.vsn = VariableSelectionNetwork(config, config.num_static_vars) - self.context_grns = nn.ModuleList([GRN(config.hidden_size, config.hidden_size, dropout=config.dropout) for _ in range(4)]) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - variable_ctx, sparse_weights = self.vsn(x) - - # Context vectors: - # variable selection context - # enrichment context - # state_c context - # state_h context - cs, ce, ch, cc = tuple(m(variable_ctx) for m in self.context_grns) - - return cs, ce, ch, cc - - -class InterpretableMultiHeadAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.n_head = config.n_head - assert config.hidden_size % config.n_head == 0 - self.d_head = config.hidden_size // config.n_head - self.qkv_linears = nn.Linear(config.hidden_size, (2 * self.n_head + 1) * self.d_head, bias=False) - self.out_proj = nn.Linear(self.d_head, config.hidden_size, bias=False) - self.attn_dropout = nn.Dropout(config.attn_dropout) - self.out_dropout = nn.Dropout(config.dropout) - self.scale = self.d_head**-0.5 - self.register_buffer("_mask", torch.triu(torch.full((config.example_length, config.example_length), float('-inf')), 1).unsqueeze(0)) - - def forward(self, x: Tensor, mask_future_timesteps: bool = True) -> Tuple[Tensor, Tensor]: - bs, t, h_size = x.shape - qkv = self.qkv_linears(x) - q, k, v = qkv.split((self.n_head * self.d_head, self.n_head * self.d_head, self.d_head), dim=-1) - q = q.view(bs, t, self.n_head, self.d_head) - k = k.view(bs, t, self.n_head, self.d_head) - v = v.view(bs, t, self.d_head) - - # attn_score = torch.einsum('bind,bjnd->bnij', q, k) - attn_score = torch.matmul(q.permute((0, 2, 1, 3)), k.permute((0, 2, 3, 1))) - attn_score.mul_(self.scale) - - if mask_future_timesteps: - attn_score = 
attn_score + self._mask
-
-        attn_prob = F.softmax(attn_score, dim=3)
-        attn_prob = self.attn_dropout(attn_prob)
-
-        # attn_vec = torch.einsum('bnij,bjd->bnid', attn_prob, v)
-        attn_vec = torch.matmul(attn_prob, v.unsqueeze(1))
-        m_attn_vec = torch.mean(attn_vec, dim=1)
-        out = self.out_proj(m_attn_vec)
-        out = self.out_dropout(out)
-
-        return out, attn_vec
-
-
-
-class TemporalFusionTransformer(nn.Module):
-    """
-    Implementation of https://arxiv.org/abs/1912.09363
-    """
-    def __init__(self, config):
-        super().__init__()
-
-        if hasattr(config, 'model'):
-            config = config.model
-
-        self.encoder_length = config.encoder_length  # determines how far into the past the model reaches for input data
-
-        self.embedding = TFTEmbedding(config)
-        self.static_encoder = StaticCovariateEncoder(config)
-
-        self.history_vsn = VariableSelectionNetwork(config, config.num_historic_vars)
-        self.history_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-        self.future_vsn = VariableSelectionNetwork(config, config.num_future_vars)
-        self.future_encoder = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)
-
-
-        self.input_gate = GLU(config.hidden_size, config.hidden_size)
-        self.input_gate_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.enrichment_grn = GRN(config.hidden_size,
-                                  config.hidden_size,
-                                  context_hidden_size=config.hidden_size,
-                                  dropout=config.dropout)
-        self.attention = InterpretableMultiHeadAttention(config)
-        self.attention_gate = GLU(config.hidden_size, config.hidden_size)
-        self.attention_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.positionwise_grn = GRN(config.hidden_size,
-                                    config.hidden_size,
-                                    dropout=config.dropout)
-
-        self.decoder_gate = GLU(config.hidden_size, config.hidden_size)
-        self.decoder_ln = LayerNorm(config.hidden_size, eps=1e-3)
-
-        self.quantile_proj = nn.Linear(config.hidden_size, len(config.quantiles))
-
-    def forward(self, x: Dict[str, Tensor]) -> Tensor:
-        s_inp, t_known_inp, t_observed_inp, t_observed_tgt = self.embedding(x)
-
-        # Static context
-        cs, ce, ch, cc = self.static_encoder(s_inp)
-        ch, cc = ch.unsqueeze(0), cc.unsqueeze(0)  # LSTM initial states
-
-        # Temporal input
-        _historical_inputs = [t_known_inp[:,:self.encoder_length,:], t_observed_tgt[:,:self.encoder_length,:]]
-        if t_observed_inp is not None:
-            _historical_inputs.insert(0, t_observed_inp[:,:self.encoder_length,:])
-
-        historical_inputs = torch.cat(_historical_inputs, dim=-2)
-        future_inputs = t_known_inp[:, self.encoder_length:]
-
-        # Encoders
-        historical_features, _ = self.history_vsn(historical_inputs, cs)
-        history, state = self.history_encoder(historical_features, (ch, cc))
-        future_features, _ = self.future_vsn(future_inputs, cs)
-        future, _ = self.future_encoder(future_features, state)
-        torch.cuda.synchronize()  # this call gives perf boost for unknown reasons
-
-        # Skip connection
-        input_embedding = torch.cat([historical_features, future_features], dim=1)
-        temporal_features = torch.cat([history, future], dim=1)
-        temporal_features = self.input_gate(temporal_features)
-        temporal_features = temporal_features + input_embedding
-        temporal_features = self.input_gate_ln(temporal_features)
-
-        # Static enrichment
-        enriched = self.enrichment_grn(temporal_features, c=ce)
-
-        # Temporal self attention
-        x, _ = self.attention(enriched, mask_future_timesteps=True)
-
-        # Don't compute historical quantiles
-        x = x[:, self.encoder_length:, :]
-        temporal_features = temporal_features[:, self.encoder_length:, :]
-        enriched = enriched[:, self.encoder_length:, :]
-
- x = self.attention_gate(x) - x = x + enriched - x = self.attention_ln(x) - - # Position-wise feed-forward - x = self.positionwise_grn(x) - - # Final skip connection - x = self.decoder_gate(x) - x = x + temporal_features - x = self.decoder_ln(x) - - out = self.quantile_proj(x) - - return out diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt deleted file mode 100644 index 8ba46efc..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh deleted file mode 100644 index c8a04c36..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/benchmark.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -[ $NUM_GPUS -eq 16 ] && WORKER_NUMS=(1 8 16) || WORKER_NUMS=(1 8) -DATASETS=(electricity traffic) - -rm -r /tmp/benchmark_results - -for DATASET in ${DATASETS[@]} -do - for NGPU in ${WORKER_NUMS[@]} - do - for BATCH_SIZE in 512 1024 1536 2048 2560 - do - for USE_AMP in --use_amp "" - do - for AFFINITY in "--affinity disabled" "--affinity single" "--affinity socket_unique_interleaved" - do - EXP_NAME="TFT_benchmark_${DATASET}_BS_${BATCH_SIZE}_${NGPU}GPU${USE_AMP}_${AFFINITY}" - python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset ${DATASET} \ - --data_path /data/processed/${DATASET}_bin \ - --batch_size=${BATCH_SIZE} \ - --lr 5e-4 \ - --epochs 1 \ - --sample 100000 5000 \ - --seed 1 \ - ${USE_AMP} \ - ${AFFINITY} \ - --clip_grad 0.1 \ - --results /tmp/benchmark_results/${EXP_NAME} - done - done - done - done -done -for P in `ls /tmp/benchmark_results/`; -do - echo ${P} - tail -n 1 /tmp/benchmark_results/${P}/dllogger.json -done diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh deleted file mode 100644 index d4c7c7e1..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/get_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -DATAPATH='/data' - -declare -A URLS=( ['electricity']='https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - ['traffic']='https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - ) - -mkdir -p ${DATAPATH}/raw -mkdir -p ${DATAPATH}/processed - -for DS in electricity traffic -do - DS_PATH=${DATAPATH}/raw/${DS} - ZIP_FNAME=${DS_PATH}.zip - if [ ! -d ${DS_PATH} ] - then - wget "${URLS[${DS}]}" -O ${ZIP_FNAME} - unzip ${ZIP_FNAME} -d ${DS_PATH} - fi - python -c "from data_utils import standarize_${DS} as standarize; standarize(\"${DS_PATH}\")" - python -c "from data_utils import preprocess; \ - from configuration import ${DS^}Config as Config; \ - preprocess(\"${DS_PATH}/standarized.csv\", \"${DATAPATH}/processed/${DS}_bin\", Config())" -done - - diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh deleted file mode 100644 index 86214a9a..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_electricity_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
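The get_data.sh script above downloads and unpacks each archive before handing off to data_utils for standardization and preprocessing. A rough Python equivalent of its download loop, using only the standard library (same paths and URLs as the script; the data_utils calls are left as comments):

```python
import os
import urllib.request
import zipfile

DATAPATH = "/data"
URLS = {
    "electricity": "https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip",
    "traffic": "https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip",
}

os.makedirs(os.path.join(DATAPATH, "raw"), exist_ok=True)
os.makedirs(os.path.join(DATAPATH, "processed"), exist_ok=True)

for ds, url in URLS.items():
    ds_path = os.path.join(DATAPATH, "raw", ds)
    if not os.path.isdir(ds_path):
        zip_fname = ds_path + ".zip"
        urllib.request.urlretrieve(url, zip_fname)  # download the archive
        with zipfile.ZipFile(zip_fname) as zf:
            zf.extractall(ds_path)                  # unpack into /data/raw/<ds>
    # standarize_<ds> and preprocess from data_utils would then run here,
    # exactly as in the shell loop above.
```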
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=30} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset electricity \ - --data_path /data/processed/electricity_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_electricity_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh deleted file mode 100644 index cab8e473..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/scripts/run_traffic_DGX1-16G.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
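Each of these run scripts starts train.py through torch.distributed.launch with one process per GPU. A minimal sketch of the worker-side setup that launcher assumes, mirroring the init code in train.py deleted below (the launcher exports LOCAL_RANK and WORLD_SIZE into each worker's environment):

```python
import os
import torch
import torch.distributed as dist

# torch.distributed.launch spawns NGPU copies of the training script and
# sets LOCAL_RANK and WORLD_SIZE in each copy's environment.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

if world_size > 1:
    torch.cuda.set_device(local_rank)  # bind exactly one GPU per worker
    dist.init_process_group(backend="nccl", init_method="env://")
```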
- -: ${SEED:=1} -: ${LR:=1e-3} -: ${NGPU:=8} -: ${BATCH_SIZE:=1024} -: ${EPOCHS:=20} - -python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ - --dataset traffic \ - --data_path /data/processed/traffic_bin \ - --batch_size=${BATCH_SIZE} \ - --sample 450000 50000 \ - --lr ${LR} \ - --epochs ${EPOCHS} \ - --seed ${SEED} \ - --use_amp \ - --results /results/TFT_traffic_bs${NGPU}x${BATCH_SIZE}_lr${LR}/seed_${SEED} diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py deleted file mode 100644 index e5ceceeb..00000000 --- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/train.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import os -import pickle -import json - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from apex import amp -from apex.optimizers import FusedAdam -#from torch.nn.parallel import DistributedDataParallel as DDP -from apex.parallel import DistributedDataParallel as DDP - -import numpy as np - -import dllogger - -from modeling import TemporalFusionTransformer -from configuration import CONFIGS -from data_utils import TFTBinaryDataset, sample_data -from log_helper import setup_logger -from criterions import QuantileLoss -from inference import predict -from utils import PerformanceMeter -import gpu_affinity -from ema import ModelEma - -def load_dataset(args, config): - train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config) - train_split = sample_data(train_split, args.sample_data[0]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True) - else: - data_sampler = RandomSampler(train_split) - train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True) - - valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config) - valid_split = sample_data(valid_split, args.sample_data[1]) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config) - if args.distributed_world_size > 1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, 
sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = perf_meter.update(args.batch_size * args.distributed_world_size, - exclude_from_total=local_step in [0, len(train_loader)-1]) - - log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips} - dllogger.log(step=global_step, 
data=log_dict, verbosity=1)
- global_step += 1
-
- validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step)
-
- if validate.early_stop_c >= args.early_stopping:
- print_once('Early stopping')
- break
-
- ### TEST PHASE ###
- state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu')
- if isinstance(model, DDP):
- model.module.load_state_dict(state_dict['model'])
- else:
- model.load_state_dict(state_dict['model'])
- model.cuda().eval()
-
- tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb'))
- cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb'))
-
- unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings)
- losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
- normalizer = unscaled_targets.abs().mean()
- quantiles = 2 * losses / normalizer
-
- if args.distributed_world_size > 1:
- quantiles = quantiles.cuda()
- dist.all_reduce(quantiles)
- quantiles /= args.distributed_world_size
-
- quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()}
- finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step}
- dllogger.log(step=(), data=finish_log, verbosity=1)
-
-def validate(args, config, model, criterion, dataloader, global_step):
- if not hasattr(validate, 'best_valid_loss'):
- validate.best_valid_loss = float('inf')
- if not hasattr(validate, 'early_stop_c'):
- validate.early_stop_c = 0
- model.eval()
-
- losses = []
- validation_start = time.time()
- for batch in dataloader:
- with torch.no_grad():
- batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()}
- predictions = model(batch)
- targets = batch['target'][:,config.encoder_length:,:]
- p_losses = criterion(predictions, targets)
- bs = next(t for t in batch.values() if t is not None).shape[0]
- losses.append((p_losses, bs))
-
- validation_end = time.time()
-
- p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch is not full
- if args.distributed_world_size > 1:
- dist.all_reduce(p_losses)
- p_losses = p_losses/args.distributed_world_size
-
- ips = len(dataloader.dataset) / (validation_end - validation_start)
-
- log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips}
-
- if log_dict['loss'] < validate.best_valid_loss:
- validate.best_valid_loss = log_dict['loss']
- validate.early_stop_c = 0
- validate.conv_step = global_step
- if not dist.is_initialized() or dist.get_rank() == 0:
- state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict()
- ckpt = {'args':args, 'config':config, 'model':state_dict}
- torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt'))
- if args.distributed_world_size > 1:
- dist.barrier()
- else:
- validate.early_stop_c += 1
-
- log_dict = {'val_'+k:v for k,v in log_dict.items()}
- dllogger.log(step=global_step, data=log_dict, verbosity=1)
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument('--data_path', type=str, required=True,
- help='Path to the dataset')
- parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(),
- help='Dataset name')
- parser.add_argument('--epochs', type=int, default=25,
-
help='Number of training epochs')
- parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1],
- help="""Subsample the dataset. Specify number of training and valid examples.
- Values can be provided in scientific notation. Floats will be truncated.""")
- parser.add_argument('--batch_size', type=int, default=64)
- parser.add_argument('--lr', type=float, default=1e-3)
- parser.add_argument('--seed', type=int, default=1)
- parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision')
- parser.add_argument('--clip_grad', type=float, default=0.0)
- parser.add_argument('--grad_accumulation', type=int, default=0)
- parser.add_argument('--early_stopping', type=int, default=1000,
- help='Stop training if validation loss does not improve for more than this number of epochs.')
- parser.add_argument('--results', type=str, default='/results',
- help='Directory in which results are stored')
- parser.add_argument('--log_file', type=str, default='dllogger.json',
- help='Name of dllogger output file')
- parser.add_argument('--distributed_world_size', type=int, metavar='N',
- default=torch.cuda.device_count(),
- help='total number of GPUs across all nodes (default: all visible GPUs)')
- parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
- help='global rank of the current worker')
- parser.add_argument('--local_rank', default=0, type=int,
- help='local rank of the current worker on this node')
- parser.add_argument('--overwrite_config', type=str, default='',
- help='JSON string used to override the config')
- parser.add_argument('--affinity', type=str,
- default='socket_unique_interleaved',
- choices=['socket', 'single', 'single_unique',
- 'socket_unique_interleaved',
- 'socket_unique_continuous',
- 'disabled'],
- help='type of CPU affinity')
- parser.add_argument("--ema_decay", type=float, default=0.0, help='Exponential moving average decay (0 disables EMA)')
-
-
- ARGS = parser.parse_args()
- main(ARGS)
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py
deleted file mode 100644
index bf88be40..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/tft_pyt/utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
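The test phase of train.py above reports the normalized quantile risk from the TFT paper: per-quantile pinball losses scaled by 2 over the mean absolute target (`quantiles = 2 * losses / normalizer`). A worked sketch of both pieces; the exact reduction inside the repo's QuantileLoss may differ:

```python
import torch

def pinball_loss(pred, target, quantiles=(0.1, 0.5, 0.9)):
    """One pinball-loss scalar per quantile; pred has one channel per quantile."""
    per_q = []
    for i, q in enumerate(quantiles):
        diff = target[..., 0] - pred[..., i]
        # pinball loss: q * diff when under-predicting, (q - 1) * diff otherwise
        per_q.append(torch.max(q * diff, (q - 1) * diff).mean())
    return torch.stack(per_q)

pred = torch.randn(8, 24, 3)         # (batch, horizon, n_quantiles)
target = torch.randn(8, 24, 1)

losses = pinball_loss(pred, target)  # P10, P50, P90 pinball losses
normalizer = target.abs().mean()
q_risk = 2 * losses / normalizer     # the 'test_p10/p50/p90' values logged above
print(q_risk, q_risk.sum())
```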
-
-import time
-
-class PerformanceMeter():
- def __init__(self):
- self.reset()
-
- def reset(self):
- self.avg = 0
- self.count = 0
- self.total_time = 0
- self.last_update_time = time.time()
- self.intervals = []
-
- def update(self, n, exclude_from_total=False):
- delta = time.time() - self.last_update_time
- self.intervals.append(delta)
- if not exclude_from_total:
- self.total_time += delta
- self.count += n
- self.avg = self.count / self.total_time
- self.last_update_time = time.time()
-
- return n/delta
-
- def reset_current_lap(self):
- self.last_update_time = time.time()
-
- def p(self, i):
- assert i <= 100
- idx = min(int(len(self.intervals) * i / 100), len(self.intervals) - 1) # clamp so p(100) does not index past the end
- return sorted(self.intervals)[idx]
-
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py
deleted file mode 100644
index e5ceceeb..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/train.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-import os
-import pickle
-import json
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.distributed as dist
-from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
-from apex import amp
-from apex.optimizers import FusedAdam
-#from torch.nn.parallel import DistributedDataParallel as DDP
-from apex.parallel import DistributedDataParallel as DDP
-
-import numpy as np
-
-import dllogger
-
-from modeling import TemporalFusionTransformer
-from configuration import CONFIGS
-from data_utils import TFTBinaryDataset, sample_data
-from log_helper import setup_logger
-from criterions import QuantileLoss
-from inference import predict
-from utils import PerformanceMeter
-import gpu_affinity
-from ema import ModelEma
-
-def load_dataset(args, config):
- train_split = TFTBinaryDataset(os.path.join(args.data_path, 'train.bin'), config)
- train_split = sample_data(train_split, args.sample_data[0])
- if args.distributed_world_size > 1:
- data_sampler = DistributedSampler(train_split, args.distributed_world_size, args.distributed_rank, seed=args.seed + args.distributed_rank, drop_last=True)
- else:
- data_sampler = RandomSampler(train_split)
- train_loader = DataLoader(train_split, batch_size=args.batch_size, num_workers=4, sampler=data_sampler, pin_memory=True)
-
- valid_split = TFTBinaryDataset(os.path.join(args.data_path, 'valid.bin'), config)
- valid_split = sample_data(valid_split, args.sample_data[1])
- if args.distributed_world_size > 1:
- data_sampler = DistributedSampler(valid_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False)
- else:
- data_sampler = None
- valid_loader = DataLoader(valid_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True)
-
- test_split = TFTBinaryDataset(os.path.join(args.data_path, 'test.bin'), config)
- if
args.distributed_world_size > 1: - data_sampler = DistributedSampler(test_split, args.distributed_world_size, args.distributed_rank, shuffle=False, drop_last=False) - else: - data_sampler = None - test_loader = DataLoader(test_split, batch_size=args.batch_size, sampler=data_sampler, num_workers=4, pin_memory=True) - - print_once(f'Train split length: {len(train_split)}') - print_once(f'Valid split length: {len(valid_split)}') - print_once(f'Test split length: {len(test_split)}') - - return train_loader, valid_loader, test_loader - -def print_once(*args, **kwargs): - if not dist.is_initialized() or dist.get_rank() == 0: - print(*args, **kwargs) - - -def main(args): - # Enable CuDNN autotuner - nproc_per_node = torch.cuda.device_count() - if args.affinity != 'disabled': - affinity = gpu_affinity.set_affinity( - args.local_rank, - nproc_per_node, - args.affinity - ) - print(f'{args.local_rank}: thread affinity: {affinity}') - - - torch.backends.cudnn.benchmark = True - - ### INIT DISTRIBUTED - if args.distributed_world_size > 1: - args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) - torch.cuda.set_device(args.local_rank) - dist.init_process_group(backend='nccl', init_method='env://') - args.distributed_world_size = int(os.environ['WORLD_SIZE']) - args.distributed_rank = dist.get_rank() - print_once(f'Distributed training with {args.distributed_world_size} GPUs') - torch.cuda.synchronize() - - if args.seed: - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - - setup_logger(args) - - config = CONFIGS[args.dataset]() - if args.overwrite_config: - config.__dict__.update(json.loads(args.overwrite_config)) - - dllogger.log(step='HPARAMS', data={**vars(args), **vars(config)}, verbosity=1) - - model = TemporalFusionTransformer(config).cuda() - if args.ema_decay: - model_ema = ModelEma(model, decay=args.ema_decay) - - print_once('Model params: {}'.format(sum(p.numel() for p in model.parameters()))) - criterion = QuantileLoss(config).cuda() - optimizer = FusedAdam(model.parameters(), lr=args.lr) - if args.use_amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic") - if args.distributed_world_size > 1: - #model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model = DDP(model) - - train_loader, valid_loader, test_loader = load_dataset(args, config) - - global_step = 0 - perf_meter = PerformanceMeter() - - for epoch in range(args.epochs): - start = time.time() - dllogger.log(step=global_step, data={'epoch': epoch}, verbosity=1) - - model.train() - for local_step, batch in enumerate(train_loader): - perf_meter.reset_current_lap() - batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()} - predictions = model(batch) - targets = batch['target'][:,config.encoder_length:,:] - p_losses = criterion(predictions, targets) - loss = p_losses.sum() - - if args.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - if not args.grad_accumulation or (global_step+1) % args.grad_accumulation == 0: - if args.clip_grad: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) - optimizer.step() - optimizer.zero_grad() - if args.ema_decay: - model_ema.update(model) - - if args.distributed_world_size > 1: - dist.all_reduce(p_losses) - p_losses /= args.distributed_world_size - loss = p_losses.sum() - - torch.cuda.synchronize() - ips = 
perf_meter.update(args.batch_size * args.distributed_world_size,
- exclude_from_total=local_step in [0, len(train_loader)-1])
-
- log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': loss.item(), 'items/s':ips}
- dllogger.log(step=global_step, data=log_dict, verbosity=1)
- global_step += 1
-
- validate(args, config, model_ema if args.ema_decay else model, criterion, valid_loader, global_step)
-
- if validate.early_stop_c >= args.early_stopping:
- print_once('Early stopping')
- break
-
- ### TEST PHASE ###
- state_dict = torch.load(os.path.join(args.results, 'checkpoint.pt'), map_location='cpu')
- if isinstance(model, DDP):
- model.module.load_state_dict(state_dict['model'])
- else:
- model.load_state_dict(state_dict['model'])
- model.cuda().eval()
-
- tgt_scalers = pickle.load(open(os.path.join(args.data_path, 'tgt_scalers.bin'), 'rb'))
- cat_encodings = pickle.load(open(os.path.join(args.data_path,'cat_encodings.bin'), 'rb'))
-
- unscaled_predictions, unscaled_targets, _, _ = predict(args, config, model, test_loader, tgt_scalers, cat_encodings)
- losses = QuantileLoss(config)(unscaled_predictions, unscaled_targets)
- normalizer = unscaled_targets.abs().mean()
- quantiles = 2 * losses / normalizer
-
- if args.distributed_world_size > 1:
- quantiles = quantiles.cuda()
- dist.all_reduce(quantiles)
- quantiles /= args.distributed_world_size
-
- quantiles = {'test_p10': quantiles[0].item(), 'test_p50': quantiles[1].item(), 'test_p90': quantiles[2].item(), 'sum':sum(quantiles).item()}
- finish_log = {**quantiles, 'average_ips':perf_meter.avg, 'convergence_step':validate.conv_step}
- dllogger.log(step=(), data=finish_log, verbosity=1)
-
-def validate(args, config, model, criterion, dataloader, global_step):
- if not hasattr(validate, 'best_valid_loss'):
- validate.best_valid_loss = float('inf')
- if not hasattr(validate, 'early_stop_c'):
- validate.early_stop_c = 0
- model.eval()
-
- losses = []
- validation_start = time.time()
- for batch in dataloader:
- with torch.no_grad():
- batch = {key: tensor.cuda() if tensor.numel() else None for key, tensor in batch.items()}
- predictions = model(batch)
- targets = batch['target'][:,config.encoder_length:,:]
- p_losses = criterion(predictions, targets)
- bs = next(t for t in batch.values() if t is not None).shape[0]
- losses.append((p_losses, bs))
-
- validation_end = time.time()
-
- p_losses = sum([l[0]*l[1] for l in losses])/sum([l[1] for l in losses]) # takes into account that the last batch is not full
- if args.distributed_world_size > 1:
- dist.all_reduce(p_losses)
- p_losses = p_losses/args.distributed_world_size
-
- ips = len(dataloader.dataset) / (validation_end - validation_start)
-
- log_dict = {'P10':p_losses[0].item(), 'P50':p_losses[1].item(), 'P90':p_losses[2].item(), 'loss': p_losses.sum().item(), 'items/s':ips}
-
- if log_dict['loss'] < validate.best_valid_loss:
- validate.best_valid_loss = log_dict['loss']
- validate.early_stop_c = 0
- validate.conv_step = global_step
- if not dist.is_initialized() or dist.get_rank() == 0:
- state_dict = model.module.state_dict() if isinstance(model, (DDP, ModelEma)) else model.state_dict()
- ckpt = {'args':args, 'config':config, 'model':state_dict}
- torch.save(ckpt, os.path.join(args.results, 'checkpoint.pt'))
- if args.distributed_world_size > 1:
- dist.barrier()
- else:
- validate.early_stop_c += 1
-
- log_dict = {'val_'+k:v for k,v in log_dict.items()}
- dllogger.log(step=global_step, data=log_dict, verbosity=1)
-
-
-if __name__ == '__main__':
- parser =
argparse.ArgumentParser()
- parser.add_argument('--data_path', type=str, required=True,
- help='Path to the dataset')
- parser.add_argument('--dataset', type=str, required=True, choices=CONFIGS.keys(),
- help='Dataset name')
- parser.add_argument('--epochs', type=int, default=25,
- help='Number of training epochs')
- parser.add_argument('--sample_data', type=lambda x: int(float(x)), nargs=2, default=[-1, -1],
- help="""Subsample the dataset. Specify number of training and valid examples.
- Values can be provided in scientific notation. Floats will be truncated.""")
- parser.add_argument('--batch_size', type=int, default=64)
- parser.add_argument('--lr', type=float, default=1e-3)
- parser.add_argument('--seed', type=int, default=1)
- parser.add_argument('--use_amp', action='store_true', help='Enable automatic mixed precision')
- parser.add_argument('--clip_grad', type=float, default=0.0)
- parser.add_argument('--grad_accumulation', type=int, default=0)
- parser.add_argument('--early_stopping', type=int, default=1000,
- help='Stop training if validation loss does not improve for more than this number of epochs.')
- parser.add_argument('--results', type=str, default='/results',
- help='Directory in which results are stored')
- parser.add_argument('--log_file', type=str, default='dllogger.json',
- help='Name of dllogger output file')
- parser.add_argument('--distributed_world_size', type=int, metavar='N',
- default=torch.cuda.device_count(),
- help='total number of GPUs across all nodes (default: all visible GPUs)')
- parser.add_argument('--distributed_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
- help='global rank of the current worker')
- parser.add_argument('--local_rank', default=0, type=int,
- help='local rank of the current worker on this node')
- parser.add_argument('--overwrite_config', type=str, default='',
- help='JSON string used to override the config')
- parser.add_argument('--affinity', type=str,
- default='socket_unique_interleaved',
- choices=['socket', 'single', 'single_unique',
- 'socket_unique_interleaved',
- 'socket_unique_continuous',
- 'disabled'],
- help='type of CPU affinity')
- parser.add_argument("--ema_decay", type=float, default=0.0, help='Exponential moving average decay (0 disables EMA)')
-
-
- ARGS = parser.parse_args()
- main(ARGS)
diff --git a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py b/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py
deleted file mode 100644
index bf88be40..00000000
--- a/Tools/PyTorch/TimeSeriesPredictionPlatform/models/tft_pyt/utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-
-class PerformanceMeter():
- def __init__(self):
- self.reset()
-
- def reset(self):
- self.avg = 0
- self.count = 0
- self.total_time = 0
- self.last_update_time = time.time()
- self.intervals = []
-
- def update(self, n, exclude_from_total=False):
- delta = time.time() - self.last_update_time
- self.intervals.append(delta)
- if not exclude_from_total:
- self.total_time += delta
- self.count += n
- self.avg = self.count / self.total_time
- self.last_update_time = time.time()
-
- return n/delta
-
- def reset_current_lap(self):
- self.last_update_time = time.time()
-
- def p(self, i):
- assert i <= 100
- idx = min(int(len(self.intervals) * i / 100), len(self.intervals) - 1) # clamp so p(100) does not index past the end
- return sorted(self.intervals)[idx]
-
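The PerformanceMeter deleted above is the source of the items/s and average_ips numbers in the training logs. A short usage sketch, assuming the utils.py above is importable; it excludes warm-up and cool-down laps from the average the same way train.py does:

```python
import time

from utils import PerformanceMeter  # the class deleted above

meter = PerformanceMeter()
for step in range(10):
    meter.reset_current_lap()       # start timing this iteration
    time.sleep(0.01)                # stand-in for one training step
    ips = meter.update(1024,        # items processed this lap
                       exclude_from_total=step in (0, 9))
print(f'average items/s: {meter.avg:.0f}, median lap: {meter.p(50):.4f}s')
```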