[NCF/PyT] Adding BYOD capabilities

2021-09-03 06:20:12 -07:00 · 2021-09-03 06:20:12 -07:00 · 5d6d417ff5
parent 4fdd014ebf
commit 5d6d417ff5
38 changed files with 2216 additions and 460 deletions
--- a/PyTorch/Recommendation/NCF/.dockerignore
+++ b/PyTorch/Recommendation/NCF/.dockerignore
@ -0,0 +1,2 @@
+.git
+data/
--- a/PyTorch/Recommendation/NCF/Dockerfile
+++ b/PyTorch/Recommendation/NCF/Dockerfile
@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.04-py3
 FROM ${FROM_IMAGE_NAME}

 RUN apt-get update && \
--- a/PyTorch/Recommendation/NCF/README.md
+++ b/PyTorch/Recommendation/NCF/README.md
--- a/PyTorch/Recommendation/NCF/convert.py
+++ b/PyTorch/Recommendation/NCF/convert.py
@ -14,7 +14,7 @@
 #
 # -----------------------------------------------------------------------
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -31,13 +31,21 @@
 from argparse import ArgumentParser
 import pandas as pd
 from load import implicit_load
+from feature_spec import FeatureSpec
+from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME, TEST_SAMPLES_PER_SERIES
 import torch
+import os
 import tqdm

-MIN_RATINGS = 20
+TEST_1 = 'test_data_1.pt'
+TEST_0 = 'test_data_0.pt'
+TRAIN_1 = 'train_data_1.pt'
+TRAIN_0 = 'train_data_0.pt'
+
 USER_COLUMN = 'user_id'
 ITEM_COLUMN = 'item_id'

+
 def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--path', type=str, default='/data/ml-20m/ratings.csv',
@ -61,7 +69,7 @@ class _TestNegSampler:
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

-    def generate(self, batch_size=128*1024):
+    def generate(self, batch_size=128 * 1024):
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)
@ -82,6 +90,71 @@ class _TestNegSampler:
        return items


+def save_feature_spec(user_cardinality, item_cardinality, dtypes, test_negative_samples, output_path,
+                      user_feature_name='user',
+                      item_feature_name='item',
+                      label_feature_name='label'):
+    feature_spec = {
+        user_feature_name: {
+            'dtype': dtypes[user_feature_name],
+            'cardinality': int(user_cardinality)
+        },
+        item_feature_name: {
+            'dtype': dtypes[item_feature_name],
+            'cardinality': int(item_cardinality)
+        },
+        label_feature_name: {
+            'dtype': dtypes[label_feature_name],
+        }
+    }
+    metadata = {
+        TEST_SAMPLES_PER_SERIES: test_negative_samples + 1
+    }
+    train_mapping = [
+        {
+            'type': 'torch_tensor',
+            'features': [
+                user_feature_name,
+                item_feature_name
+            ],
+            'files': [TRAIN_0]
+        },
+        {
+            'type': 'torch_tensor',
+            'features': [
+                label_feature_name
+            ],
+            'files': [TRAIN_1]
+        }
+    ]
+    test_mapping = [
+        {
+            'type': 'torch_tensor',
+            'features': [
+                user_feature_name,
+                item_feature_name
+            ],
+            'files': [TEST_0],
+        },
+        {
+            'type': 'torch_tensor',
+            'features': [
+                label_feature_name
+            ],
+            'files': [TEST_1],
+        }
+    ]
+    channel_spec = {
+        USER_CHANNEL_NAME: [user_feature_name],
+        ITEM_CHANNEL_NAME: [item_feature_name],
+        LABEL_CHANNEL_NAME: [label_feature_name]
+    }
+    source_spec = {'train': train_mapping, 'test': test_mapping}
+    feature_spec = FeatureSpec(feature_spec=feature_spec, metadata=metadata, source_spec=source_spec,
+                               channel_spec=channel_spec, base_directory="")
+    feature_spec.to_yaml(output_path=output_path)
+
+
 def main():
    args = parse_args()

@ -91,39 +164,54 @@ def main():
    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

-    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
-    grouped = df.groupby(USER_COLUMN)
-    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)
-
    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

+    user_cardinality = df[USER_COLUMN].max() + 1
+    item_cardinality = df[ITEM_COLUMN].max() + 1
+
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
-    df = df.drop_duplicates() # assuming it keeps order
+    df = df.drop_duplicates()  # assuming it keeps order

-    # now we have filtered and sorted by time data, we can split test data out
+    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
-    test_data = grouped_sorted.tail(1).sort_values(by='user_id')
-    # need to pop for each group
+    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
+    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

-    # Note: no way to keep reference training data ordering because use of python set and multi-process
-    # It should not matter since it will be later randomized again
-    # save train and val data that is fixed.
-    train_ratings = torch.from_numpy(train_data.values)
-    torch.save(train_ratings, args.output+'/train_ratings.pt')
-    test_ratings = torch.from_numpy(test_data.values)
-    torch.save(test_ratings, args.output+'/test_ratings.pt')
-
-    sampler = _TestNegSampler(train_ratings.cpu().numpy(), args.valid_negative)
+    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    test_negs = test_negs.reshape(-1, args.valid_negative)
-    torch.save(test_negs, args.output+'/test_negatives.pt')
+
+    # Reshape train set into user,item,label tabular and save
+    train_ratings = torch.from_numpy(train_data.values).cuda()
+    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
+    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
+    torch.save(train_labels, os.path.join(args.output, TRAIN_1))
+
+    # Reshape test set into user,item,label tabular and save
+    # All users have the same number of items, items for a given user appear consecutively
+    test_ratings = torch.from_numpy(test_data.values).cuda()
+    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
+    test_items_pos = test_ratings[:, 1:2]
+    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
+    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
+    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
+    negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
+    test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
+    dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
+    test_tensor = torch.cat((test_users, test_items), dim=1)
+    torch.save(test_tensor, os.path.join(args.output, TEST_0))
+    torch.save(test_labels, os.path.join(args.output, TEST_1))
+
+    save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
+                      test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml')
+

 if __name__ == '__main__':
    main()
--- a/PyTorch/Recommendation/NCF/convert_test.py
+++ b/PyTorch/Recommendation/NCF/convert_test.py
@ -0,0 +1,158 @@
+# Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+import pandas as pd
+import numpy as np
+from load import implicit_load
+from convert import save_feature_spec, _TestNegSampler, TEST_0, TEST_1, TRAIN_0, TRAIN_1
+import torch
+import os
+
+USER_COLUMN = 'user_id'
+ITEM_COLUMN = 'item_id'
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('--path', type=str, default='/data/ml-20m/ratings.csv',
+                        help='Path to reviews CSV file from MovieLens')
+    parser.add_argument('--output', type=str, default='/data',
+                        help='Output directory for train and test files')
+    parser.add_argument('--valid_negative', type=int, default=100,
+                        help='Number of negative samples for each positive test example')
+    parser.add_argument('--seed', '-s', type=int, default=1,
+                        help='Manually set random seed for torch')
+    parser.add_argument('--test', type=str, help='select modification to be applied to the set')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+
+    print("Loading raw data from {}".format(args.path))
+    df = implicit_load(args.path, sort=False)
+
+    if args.test == 'less_user':
+        to_drop = set(list(df[USER_COLUMN].unique())[-100:])
+        df = df[~df[USER_COLUMN].isin(to_drop)]
+    if args.test == 'less_item':
+        to_drop = set(list(df[ITEM_COLUMN].unique())[-100:])
+        df = df[~df[ITEM_COLUMN].isin(to_drop)]
+    if args.test == 'more_user':
+        sample = df.sample(frac=0.2).copy()
+        sample[USER_COLUMN] = sample[USER_COLUMN] + 10000000
+        df = df.append(sample)
+        users = df[USER_COLUMN]
+        df = df[users.isin(users[users.duplicated(keep=False)])]  # make sure something remains in the train set
+    if args.test == 'more_item':
+        sample = df.sample(frac=0.2).copy()
+        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 10000000
+        df = df.append(sample)
+
+    print("Mapping original user and item IDs to new sequential IDs")
+    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
+    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]
+
+    user_cardinality = df[USER_COLUMN].max() + 1
+    item_cardinality = df[ITEM_COLUMN].max() + 1
+
+    # Need to sort before popping to get last item
+    df.sort_values(by='timestamp', inplace=True)
+
+    # clean up data
+    del df['rating'], df['timestamp']
+    df = df.drop_duplicates()  # assuming it keeps order
+
+    # Test set is the last interaction for a given user
+    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
+    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
+    # Train set is all interactions but the last one
+    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
+
+    sampler = _TestNegSampler(train_data.values, args.valid_negative)
+    test_negs = sampler.generate().cuda()
+    if args.valid_negative > 0:
+        test_negs = test_negs.reshape(-1, args.valid_negative)
+    else:
+        test_negs = test_negs.reshape(test_data.shape[0], 0)
+
+    if args.test == 'more_pos':
+        mask = np.random.rand(len(test_data)) < 0.5
+        sample = test_data[mask].copy()
+        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 5
+        test_data = test_data.append(sample)
+        test_negs_copy = test_negs[mask]
+        test_negs = torch.cat((test_negs, test_negs_copy), dim=0)
+    if args.test == 'less_pos':
+        mask = np.random.rand(len(test_data)) < 0.5
+        test_data = test_data[mask]
+        test_negs = test_negs[mask]
+
+    # Reshape train set into user,item,label tabular and save
+    train_ratings = torch.from_numpy(train_data.values).cuda()
+    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
+    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
+    torch.save(train_labels, os.path.join(args.output, TRAIN_1))
+
+    # Reshape test set into user,item,label tabular and save
+    # All users have the same number of items, items for a given user appear consecutively
+    test_ratings = torch.from_numpy(test_data.values).cuda()
+    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
+    test_items_pos = test_ratings[:, 1:2]
+    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
+    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
+    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
+    negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
+    test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
+    dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
+    test_tensor = torch.cat((test_users, test_items), dim=1)
+    torch.save(test_tensor, os.path.join(args.output, TEST_0))
+    torch.save(test_labels, os.path.join(args.output, TEST_1))
+
+    if args.test == 'other_names':
+        dtypes = {'user_2': str(test_users.dtype),
+                  'item_2': str(test_items.dtype),
+                  'label_2': str(test_labels.dtype)}
+        save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
+                          test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml',
+                          user_feature_name='user_2',
+                          item_feature_name='item_2',
+                          label_feature_name='label_2')
+    else:
+        save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
+                          test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml')
+
+
+if __name__ == '__main__':
+    main()
--- a/PyTorch/Recommendation/NCF/data/csv_conversion/feature_spec.yaml
+++ b/PyTorch/Recommendation/NCF/data/csv_conversion/feature_spec.yaml
@ -0,0 +1,43 @@
+feature_spec:
+  user:
+    cardinality: auto
+  item:
+    cardinality: auto
+  label:
+
+metadata:
+  test_samples_per_series: 3
+
+source_spec:
+  train:
+    - type: csv
+      features: #Each line corresponds to a column in the csv files
+        - user
+        - item
+        - label
+      files:
+        - train_data_1.csv # we assume no header
+        - train_data_2.csv
+  test:
+    - type: csv
+      features:
+        - user
+        - item
+        - label
+      files:
+        - test_data_1.csv
+
+channel_spec:
+  user_ch: # Channel names are model-specific magics (in this model, neumf_constants.py)
+    - user
+  item_ch:
+    - item
+  label_ch:
+    - label
+
+# Requirements:
+
+# We assume the ids supplied have already been factorized into 0...N
+
+# In the mapping to be used for validation and testing, candidates for each series (each user) appear consecutively.
+# Each series has the same number of items: metadata['test_samples_per_series']
--- a/PyTorch/Recommendation/NCF/data/csv_conversion/test_data_1.csv
+++ b/PyTorch/Recommendation/NCF/data/csv_conversion/test_data_1.csv
@ -0,0 +1,30 @@
+0, 8, 0
+0, 18, 0
+0, 17, 1
+1, 7, 0
+1, 6, 0
+1, 16, 1
+2, 12, 0
+2, 13, 0
+2, 16, 1
+3, 3, 0
+3, 1, 0
+3, 5, 1
+4, 16, 0
+4, 3, 0
+4, 8, 1
+5, 14, 0
+5, 12, 0
+5, 12, 1
+6, 3, 0
+6, 3, 0
+6, 1, 1
+7, 3, 0
+7, 18, 0
+7, 8, 1
+8, 8, 0
+8, 8, 0
+8, 2, 1
+9, 19, 0
+9, 9, 0
+9, 18, 1
--- a/PyTorch/Recommendation/NCF/data/csv_conversion/train_data_1.csv
+++ b/PyTorch/Recommendation/NCF/data/csv_conversion/train_data_1.csv
@ -0,0 +1,60 @@
+0, 14, 0
+0, 3, 0
+0, 18, 1
+0, 15, 1
+0, 2, 0
+0, 1, 1
+0, 5, 1
+0, 7, 0
+0, 12, 1
+0, 19, 0
+1, 9, 1
+1, 0, 0
+1, 16, 1
+1, 2, 0
+1, 8, 1
+1, 17, 0
+1, 17, 1
+1, 9, 0
+1, 5, 0
+1, 12, 1
+2, 8, 1
+2, 0, 1
+2, 1, 1
+2, 0, 0
+2, 4, 0
+2, 17, 1
+2, 18, 0
+2, 3, 0
+2, 10, 0
+2, 18, 1
+3, 14, 1
+3, 4, 0
+3, 0, 0
+3, 16, 1
+3, 6, 0
+3, 17, 1
+3, 0, 1
+3, 0, 0
+3, 3, 1
+3, 0, 1
+4, 13, 1
+4, 8, 1
+4, 1, 1
+4, 14, 0
+4, 18, 0
+4, 7, 1
+4, 19, 1
+4, 3, 1
+4, 17, 1
+4, 17, 0
+5, 8, 1
+5, 10, 0
+5, 4, 0
+5, 19, 0
+5, 12, 0
+5, 3, 1
+5, 5, 0
+5, 8, 1
+5, 19, 1
+5, 12, 0
--- a/PyTorch/Recommendation/NCF/data/csv_conversion/train_data_2.csv
+++ b/PyTorch/Recommendation/NCF/data/csv_conversion/train_data_2.csv
@ -0,0 +1,40 @@
+6, 18, 0
+6, 19, 0
+6, 4, 0
+6, 16, 1
+6, 19, 0
+6, 2, 0
+6, 4, 0
+6, 2, 1
+6, 0, 0
+6, 12, 1
+7, 3, 0
+7, 7, 0
+7, 16, 0
+7, 4, 0
+7, 19, 0
+7, 11, 1
+7, 10, 1
+7, 13, 1
+7, 18, 0
+7, 4, 0
+8, 4, 0
+8, 5, 0
+8, 12, 0
+8, 2, 0
+8, 14, 0
+8, 19, 1
+8, 0, 0
+8, 17, 1
+8, 19, 1
+8, 15, 1
+9, 9, 1
+9, 17, 0
+9, 9, 1
+9, 14, 0
+9, 11, 0
+9, 17, 1
+9, 4, 0
+9, 1, 0
+9, 8, 0
+9, 10, 1
--- a/PyTorch/Recommendation/NCF/data/like_movielens/ratings.csv
+++ b/PyTorch/Recommendation/NCF/data/like_movielens/ratings.csv
@ -0,0 +1,50 @@
+0, 17, 38308
+0, 1, 38302
+0, 10, 53558
+0, 17, 53042
+0, 12, 43899
+1, 4, 85239
+1, 3, 44884
+1, 3, 37412
+1, 8, 58416
+1, 9, 39814
+2, 6, 53985
+2, 17, 63080
+2, 9, 85791
+2, 19, 37194
+2, 3, 76871
+3, 4, 32445
+3, 1, 97224
+3, 8, 76409
+3, 6, 81547
+3, 0, 52471
+4, 15, 96242
+4, 3, 72309
+4, 9, 54815
+4, 6, 94187
+4, 16, 97208
+5, 5, 56902
+5, 0, 23414
+5, 8, 55770
+5, 5, 27745
+5, 6, 61599
+6, 12, 21675
+6, 4, 53968
+6, 7, 66164
+6, 13, 94933
+6, 1, 92957
+7, 9, 30137
+7, 11, 85128
+7, 18, 30088
+7, 14, 32186
+7, 10, 84664
+8, 1, 39714
+8, 4, 27987
+8, 15, 70023
+8, 17, 93690
+8, 8, 93827
+9, 4, 80146
+9, 8, 20896
+9, 1, 55230
+9, 13, 29631
+9, 2, 46368
--- a/PyTorch/Recommendation/NCF/data/ml-20m/feature_spec_template.yaml
+++ b/PyTorch/Recommendation/NCF/data/ml-20m/feature_spec_template.yaml
@ -0,0 +1,52 @@
+feature_spec:
+  user:
+    dtype: torch.int64
+    cardinality: 138493
+  item:
+    dtype: torch.int64
+    cardinality: 26744
+  label:
+    dtype: torch.float32
+
+metadata:
+  test_samples_per_series: 101
+
+source_spec:
+  train:
+    - type: torch_tensor
+      features:
+        # For torch_tensor, each line corresponds to a column. They are ordered
+        - user
+        - item
+      files:
+        # Loader currently only supports one file per chunk
+        - train_data_0.pt # Paths are relative to data-spec's directory
+    - type: torch_tensor
+      features:
+        - label
+      files:
+        - train_data_1.pt
+  test:
+    - type: torch_tensor
+      features:
+        - user
+        - item
+      files:
+        - test_data_0.pt
+    - type: torch_tensor
+      features:
+        - label
+      files:
+        - test_data_1.pt
+
+channel_spec:
+  user_ch: # Channel names are model-specific magics (in this model, neumf_constants.py)
+    - user
+  item_ch:
+    - item
+  label_ch:
+    - label
+
+# Requirements:
+
+# During validation, for each user we have the same number of samples, supplied consecutively
--- a/PyTorch/Recommendation/NCF/dataloading.py
+++ b/PyTorch/Recommendation/NCF/dataloading.py
@ -14,7 +14,7 @@
 #
 # -----------------------------------------------------------------------
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -28,95 +28,227 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import time
 import torch
+import os
+from feature_spec import FeatureSpec
+from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME, TEST_SAMPLES_PER_SERIES


-def create_test_data(test_ratings, test_negs, args):
-    test_users = test_ratings[:,0]
-    test_pos = test_ratings[:,1].reshape(-1,1)
+class TorchTensorDataset:
+    """ Warning! This dataset/loader uses torch.load. Torch.load implicitly uses pickle. Pickle is insecure.
+    It is trivial to achieve arbitrary code execution using a prepared pickle payload. Only unpickle data you trust."""

-    # create items with real sample at last position
-    num_valid_negative = test_negs.shape[1]
-    test_users = test_users.reshape(-1,1).repeat(1, 1 + num_valid_negative)
-    test_items = torch.cat((test_negs, test_pos), dim=1)
-    del test_ratings, test_negs
+    def __init__(self, feature_spec: FeatureSpec, mapping_name: str, args):
+        self.local_rank = args.local_rank
+        self.mapping_name = mapping_name
+        self.features = dict()
+        self.feature_spec = feature_spec
+        self._load_features()

-    # generate dup mask and real indices for exact same behavior on duplication compare to reference
-    # here we need a sort that is stable(keep order of duplicates)
-    sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
-    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
-    indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
-    stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
-    # produce -1 mask
-    dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
-    dup_mask = dup_mask.type(torch.uint8)
-    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask), dim=1)
-    dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])
-    # produce real sample indices to later check in topk
-    sorted_items, indices = (test_items != test_pos).type(torch.uint8).sort()
-    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
-    indices_order = torch.sort(sum_item_indices)[1]
-    stable_indices = torch.gather(indices, 1, indices_order)
-    real_indices = stable_indices[:,0]
-
-    if args.distributed:
-        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
-        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
-        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
-        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]
-
-    test_users = test_users.view(-1).split(args.valid_batch_size)
-    test_items = test_items.view(-1).split(args.valid_batch_size)
-
-    return test_users, test_items, dup_mask, real_indices
+    def _load_features(self):
+        chunks = self.feature_spec.source_spec[self.mapping_name]
+        for chunk in chunks:
+            assert chunk['type'] == 'torch_tensor', "Only torch_tensor files supported in this loader"
+            files_list = chunk['files']
+            assert len(files_list) == 1, "Only one file per chunk supported in this loader"
+            file_relative_path = files_list[0]
+            path_to_load = os.path.join(self.feature_spec.base_directory, file_relative_path)
+            chunk_data = torch.load(path_to_load, map_location=torch.device('cuda:{}'.format(self.local_rank)))
+            running_pos = 0
+            for feature_name in chunk['features']:
+                next_running_pos = running_pos + 1
+                feature_data = chunk_data[:, running_pos:next_running_pos]
+                # This is needed because slicing instead of indexing keeps the data 2-dimensional
+                feature_data = feature_data.reshape(-1, 1)
+                running_pos = next_running_pos
+                self.features[feature_name] = feature_data


-def prepare_epoch_train_data(train_ratings, nb_items, args):
-    # create label
-    train_label = torch.ones_like(train_ratings[:,0], dtype=torch.float32)
-    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
-    neg_label = neg_label.repeat(args.negative_samples)
-    train_label = torch.cat((train_label,neg_label))
-    del neg_label
+class TestDataLoader:
+    def __init__(self, dataset: TorchTensorDataset, args):
+        self.dataset = dataset
+        self.feature_spec = dataset.feature_spec
+        self.channel_spec = self.feature_spec.channel_spec
+        self.samples_in_series = self.feature_spec.metadata[TEST_SAMPLES_PER_SERIES]
+        self.raw_dataset_length = None  # First feature loaded sets this. Total length before splitting across cards
+        self.data = dict()
+        self.world_size = args.world_size
+        self.local_rank = args.local_rank
+        self.batch_size = args.valid_batch_size

-    train_users = train_ratings[:,0]
-    train_items = train_ratings[:,1]
+        self._build_channel_dict()
+        self._deduplication_augmentation()
+        self._split_between_devices()
+        self._split_into_batches()

-    train_users_per_worker = len(train_label) / args.world_size
-    train_users_begin = int(train_users_per_worker * args.local_rank)
-    train_users_end = int(train_users_per_worker * (args.local_rank + 1))
+    def _build_channel_dict(self):
+        for channel_name, channel_features in self.channel_spec.items():
+            channel_tensors = dict()
+            for feature_name in channel_features:
+                channel_tensors[feature_name] = self.dataset.features[feature_name]

-    # prepare data for epoch
-    neg_users = train_users.repeat(args.negative_samples)
-    neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)
+                if not self.raw_dataset_length:
+                    self.raw_dataset_length = channel_tensors[feature_name].shape[0]
+                else:
+                    assert self.raw_dataset_length == channel_tensors[feature_name].shape[0]

-    epoch_users = torch.cat((train_users, neg_users))
-    epoch_items = torch.cat((train_items, neg_items))
+            self.data[channel_name] = channel_tensors

-    del neg_users, neg_items
+    def _deduplication_augmentation(self):
+        # Augmentation
+        # This adds a duplication mask tensor.
+        # This is here to exactly replicate the MLPerf training regime. Moving this deduplication to the candidate item
+        # generation stage increases the real diversity of the candidates, which makes the ranking task harder
+        # and results in a drop in HR@10 of approx 0.01. This has been deemed unacceptable (May 2021).

-    # shuffle prepared data and split into batches
-    epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
-    epoch_indices += train_users_begin
+        # We need the duplication mask to determine if a given item should be skipped during ranking
+        # If an item with label 1 is duplicated in the sampled ones, we need to be careful to not mark the one with
+        # label 1 as a duplicate. If an item appears repeatedly only with label 1, no duplicates are marked.

-    epoch_users = epoch_users[epoch_indices]
-    epoch_items = epoch_items[epoch_indices]
-    epoch_label = train_label[epoch_indices]
+        # To easily compute candidates, we sort the items. This will impact the distribution of examples between
+        # devices, but should not influence the numerics or performance meaningfully.
+        # We need to assure that the positive item, which we don't want to mark as a duplicate, appears first.
+        # We do this by adding labels as a secondary factor

-    if args.distributed:
-        local_batch = args.batch_size // args.world_size
-    else:
-        local_batch = args.batch_size
+        # Reshape the tensors to have items for a given user in a single row
+        user_feature_name = self.channel_spec[USER_CHANNEL_NAME][0]
+        item_feature_name = self.channel_spec[ITEM_CHANNEL_NAME][0]
+        label_feature_name = self.channel_spec[LABEL_CHANNEL_NAME][0]
+        self.ignore_mask_channel_name = 'mask_ch'
+        self.ignore_mask_feature_name = 'mask'

-    epoch_users = epoch_users.split(local_batch)
-    epoch_items = epoch_items.split(local_batch)
-    epoch_label = epoch_label.split(local_batch)
+        items = self.data[ITEM_CHANNEL_NAME][item_feature_name].view(-1, self.samples_in_series)
+        users = self.data[USER_CHANNEL_NAME][user_feature_name].view(-1, self.samples_in_series)
+        labels = self.data[LABEL_CHANNEL_NAME][label_feature_name].view(-1, self.samples_in_series)

-    # the last batch will almost certainly be smaller, drop it
-    epoch_users = epoch_users[:-1]
-    epoch_items = epoch_items[:-1]
-    epoch_label = epoch_label[:-1]
+        sorting_weights = items.float() - labels.float() * 0.5
+        _, indices = torch.sort(sorting_weights)
+        # The gather reorders according to the indices decided by the sort above
+        sorted_items = torch.gather(items, 1, indices)
+        sorted_labels = torch.gather(labels, 1, indices)
+        sorted_users = torch.gather(users, 1, indices)

-    return epoch_users, epoch_items, epoch_label
+        dup_mask = sorted_items[:, 0:-1] == sorted_items[:, 1:]  # This says if a given item is equal to the next one
+        dup_mask = dup_mask.type(torch.bool)
+        # The first item for a given user can never be a duplicate:
+        dup_mask = torch.cat((torch.zeros_like(dup_mask[:, 0:1]), dup_mask), dim=1)

+        # Reshape them back
+        self.data[ITEM_CHANNEL_NAME][item_feature_name] = sorted_items.view(-1, 1)
+        self.data[USER_CHANNEL_NAME][user_feature_name] = sorted_users.view(-1, 1)
+        self.data[LABEL_CHANNEL_NAME][label_feature_name] = sorted_labels.view(-1, 1)
+        self.data[self.ignore_mask_channel_name] = dict()
+        self.data[self.ignore_mask_channel_name][self.ignore_mask_feature_name] = dup_mask.view(-1, 1)
+
+    def _split_between_devices(self):
+        if self.world_size > 1:
+            # DO NOT REPLACE WITH torch.chunk (number of returned chunks can silently be lower than requested).
+            # It would break compatibility with small datasets.
+            num_test_cases = self.raw_dataset_length / self.samples_in_series
+            smaller_batch = (int(num_test_cases // self.world_size)) * self.samples_in_series
+            bigger_batch = smaller_batch + self.samples_in_series
+            remainder = int(num_test_cases % self.world_size)
+            samples_per_card = [bigger_batch] * remainder + [smaller_batch] * (self.world_size - remainder)
+            for channel_name, channel_dict in self.data.items():
+                for feature_name, feature_tensor in channel_dict.items():
+                    channel_dict[feature_name] = \
+                        channel_dict[feature_name].split(samples_per_card)[self.local_rank]
+
+    def _split_into_batches(self):
+        self.batches = None
+        # This is the structure of each batch, waiting to be copied and filled in with data
+        for channel_name, channel_dict in self.data.items():
+            for feature_name, feature_tensor in channel_dict.items():
+                feature_batches = feature_tensor.view(-1).split(self.batch_size)
+                if not self.batches:
+                    self.batches = list(
+                        {channel_name: dict() for channel_name in self.data.keys()} for _ in feature_batches)
+                for pos, feature_batch_data in enumerate(feature_batches):
+                    self.batches[pos][channel_name][feature_name] = feature_batch_data
+
+    def get_epoch_data(self):
+        return self.batches
+
+    def get_ignore_mask(self):
+        return self.data[self.ignore_mask_channel_name][self.ignore_mask_feature_name]
+
+
+class TrainDataloader:
+    def __init__(self, dataset: TorchTensorDataset, args):
+        self.dataset = dataset
+        self.local_rank = args.local_rank
+        if args.distributed:
+            self.local_batch = args.batch_size // args.world_size
+        else:
+            self.local_batch = args.batch_size
+
+        self.feature_spec = dataset.feature_spec
+        self.channel_spec = self.feature_spec.channel_spec
+        self.negative_samples = args.negative_samples
+
+        self.data = dict()
+        self.raw_dataset_length = None  # first feature loaded sets this
+        self._build_channel_dict()
+        self.length_after_augmentation = self.raw_dataset_length * (self.negative_samples + 1)
+        samples_per_worker = self.length_after_augmentation / args.world_size
+        self.samples_begin = int(samples_per_worker * args.local_rank)
+        self.samples_end = int(samples_per_worker * (args.local_rank + 1))
+
+    def _build_channel_dict(self):
+        for channel_name, channel_features in self.channel_spec.items():
+            channel_tensors = dict()
+            for feature_name in channel_features:
+                channel_tensors[feature_name] = self.dataset.features[feature_name]
+                if not self.raw_dataset_length:
+                    self.raw_dataset_length = channel_tensors[feature_name].shape[0]
+                else:
+                    assert self.raw_dataset_length == channel_tensors[feature_name].shape[0]
+            self.data[channel_name] = channel_tensors
+
+    def get_epoch_data(self):
+
+        # Augment, appending args.negative_samples times the original set, now with random items end negative labels
+        augmented_data = {channel_name: dict() for channel_name in self.data.keys()}
+        user_feature_name = self.channel_spec[USER_CHANNEL_NAME][0]
+        item_feature_name = self.channel_spec[ITEM_CHANNEL_NAME][0]
+        label_feature_name = self.channel_spec[LABEL_CHANNEL_NAME][0]
+
+        # USER
+        user_tensor = self.data[USER_CHANNEL_NAME][user_feature_name]
+        neg_users = user_tensor.repeat(self.negative_samples, 1)
+        augmented_users = torch.cat((user_tensor, neg_users))
+        augmented_data[USER_CHANNEL_NAME][user_feature_name] = augmented_users
+        del neg_users
+
+        # ITEM
+        item_tensor = self.data[ITEM_CHANNEL_NAME][item_feature_name]
+        neg_items = torch.empty_like(item_tensor).repeat(self.negative_samples, 1) \
+            .random_(0, self.feature_spec.feature_spec[item_feature_name]['cardinality'])
+        augmented_items = torch.cat((item_tensor, neg_items))
+        augmented_data[ITEM_CHANNEL_NAME][item_feature_name] = augmented_items
+        del neg_items
+
+        # LABEL
+        label_tensor = self.data[LABEL_CHANNEL_NAME][label_feature_name]
+        neg_label = torch.zeros_like(label_tensor, dtype=torch.float32).repeat(self.negative_samples, 1)
+        augmented_labels = torch.cat((label_tensor, neg_label))
+        del neg_label
+        augmented_data[LABEL_CHANNEL_NAME][label_feature_name] = augmented_labels
+
+        # Labels are not shuffled between cards.
+        # This replicates previous behaviour.
+        epoch_indices = torch.randperm(self.samples_end - self.samples_begin, device='cuda:{}'.format(self.local_rank))
+        epoch_indices += self.samples_begin
+
+        batches = None
+        for channel_name, channel_dict in augmented_data.items():
+            for feature_name, feature_tensor in channel_dict.items():
+                # the last batch will almost certainly be smaller, drop it
+                # Warning: may not work if there's only one
+                feature_batches = feature_tensor.view(-1)[epoch_indices].split(self.local_batch)[:-1]
+                if not batches:
+                    batches = list({channel_name: dict() for channel_name in self.data.keys()} for _ in feature_batches)
+                for pos, feature_batch_data in enumerate(feature_batches):
+                    batches[pos][channel_name][feature_name] = feature_batch_data
+
+        return batches
--- a/PyTorch/Recommendation/NCF/feature_spec.py
+++ b/PyTorch/Recommendation/NCF/feature_spec.py
@ -0,0 +1,50 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import os
+from typing import List, Dict
+
+
+class FeatureSpec:
+    def __init__(self, feature_spec, source_spec, channel_spec, metadata, base_directory):
+        self.feature_spec: Dict = feature_spec
+        self.source_spec: Dict = source_spec
+        self.channel_spec: Dict = channel_spec
+        self.metadata: Dict = metadata
+        self.base_directory: str = base_directory
+
+    @classmethod
+    def from_yaml(cls, path):
+        with open(path, 'r') as feature_spec_file:
+            base_directory = os.path.dirname(path)
+            feature_spec = yaml.safe_load(feature_spec_file)
+            return cls.from_dict(feature_spec, base_directory=base_directory)
+
+    @classmethod
+    def from_dict(cls, source_dict, base_directory):
+        return cls(base_directory=base_directory, **source_dict)
+
+    def to_dict(self) -> Dict:
+        attributes_to_dump = ['feature_spec', 'source_spec', 'channel_spec', 'metadata']
+        return {attr: self.__dict__[attr] for attr in attributes_to_dump}
+
+    def to_string(self):
+        return yaml.dump(self.to_dict())
+
+    def to_yaml(self, output_path=None):
+        if not output_path:
+            output_path = self.base_directory + '/feature_spec.yaml'
+        with open(output_path, 'w') as output_file:
+            print(yaml.dump(self.to_dict()), file=output_file)
--- a/PyTorch/Recommendation/NCF/fp_optimizers.py
+++ b/PyTorch/Recommendation/NCF/fp_optimizers.py
@ -1,45 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-class Fp16Optimizer:
-
-    def __init__(self, fp16_model, loss_scale=8192.0):
-        print('Initializing fp16 optimizer')
-        self.initialize_model(fp16_model)
-        self.loss_scale = loss_scale
-
-    def initialize_model(self, model):
-        print('Reset fp16 grad')
-        self.fp16_model = model
-        for param in self.fp16_model.parameters():
-            param.grad = None
-
-        print('Initializing fp32 clone weights')
-        self.fp32_params = [param.clone().type(torch.cuda.FloatTensor).detach()
-                            for param in model.parameters()]
-        for param in self.fp32_params:
-            param.requires_grad = True
-
-    def backward(self, loss):
-        loss *= self.loss_scale
-        loss.backward()
-
-    def step(self, optimizer):
-        optimizer.step(grads=[p.grad for p in self.fp16_model.parameters()],
-                       output_params=self.fp16_model.parameters(), scale=self.loss_scale)
-
-        for p in self.fp16_model.parameters():
-            p.grad = None
--- a/PyTorch/Recommendation/NCF/full_test_suite.md
+++ b/PyTorch/Recommendation/NCF/full_test_suite.md
@ -0,0 +1,25 @@
+rm -r /data/cache/ml-20m
+
+## Prepare the standard dataset:
+
+./prepare_dataset.sh
+
+## Prepare the modified dataset:
+
+./test_dataset.sh
+
+## Run on the modified dataset:
+
+./test_cases.sh
+
+## Check featurespec:
+
+python test_featurespec_correctness.py /data/cache/ml-20m/feature_spec.yaml /data/ml-20m/feature_spec_template.yaml
+
+## Other dataset:
+
+rm -r /data/cache/ml-1m
+
+./prepare_dataset.sh ml-1m
+
+python -m torch.distributed.launch --nproc_per_node=1 --use_env ncf.py --data /data/cache/ml-1m --epochs 1
--- a/PyTorch/Recommendation/NCF/img/box_plots.png
+++ b/PyTorch/Recommendation/NCF/img/box_plots.png
--- a/PyTorch/Recommendation/NCF/img/df_diagram.png
+++ b/PyTorch/Recommendation/NCF/img/df_diagram.png
--- a/PyTorch/Recommendation/NCF/img/dgx1v16_curve.png
+++ b/PyTorch/Recommendation/NCF/img/dgx1v16_curve.png
--- a/PyTorch/Recommendation/NCF/img/dgx1v_32_curve.png
+++ b/PyTorch/Recommendation/NCF/img/dgx1v_32_curve.png
--- a/PyTorch/Recommendation/NCF/img/hr_histogram.png
+++ b/PyTorch/Recommendation/NCF/img/hr_histogram.png
--- a/PyTorch/Recommendation/NCF/img/layout_example.png
+++ b/PyTorch/Recommendation/NCF/img/layout_example.png
--- a/PyTorch/Recommendation/NCF/img/val_curves.png
+++ b/PyTorch/Recommendation/NCF/img/val_curves.png
--- a/PyTorch/Recommendation/NCF/inference.py
+++ b/PyTorch/Recommendation/NCF/inference.py
@ -82,18 +82,23 @@ def main():
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
-        for _ in range(args.num_batches):
+        for i in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
-            latencies.append(time.time() - start)
+            end_time = time.time()
+
+            if i < 10: # warmup iterations
+                continue
+
+            latencies.append(end_time - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
-        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 0.90)
-        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 0.95)
-        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 0.99)
+        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
+        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
+        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
--- a/PyTorch/Recommendation/NCF/load.py
+++ b/PyTorch/Recommendation/NCF/load.py
@ -11,12 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from collections import namedtuple

 import pandas as pd

-
 RatingData = namedtuple('RatingData',
                        ['items', 'users', 'ratings', 'min_date', 'max_date'])

@ -67,6 +81,13 @@ def load_ml_20m(filename, sort=True):
    return process_movielens(ratings, sort=sort)


+def load_unknown(filename, sort=True):
+    names = ['user_id', 'item_id', 'timestamp']
+    ratings = pd.read_csv(filename, names=names, header=0, engine='python')
+    ratings['rating'] = 5
+    return process_movielens(ratings, sort=sort)
+
+
 DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]


@ -74,7 +95,8 @@ def get_dataset_name(filename):
    for dataset in DATASETS:
        if dataset in filename.replace('-', '_').lower():
            return dataset
-    raise NotImplementedError
+    print("Unknown dataset. Expecting `user_id`, `item_id` , and `timestamp`")
+    return "unknown"


 def implicit_load(filename, sort=True):
--- a/PyTorch/Recommendation/NCF/ncf.py
+++ b/PyTorch/Recommendation/NCF/ncf.py
@ -14,7 +14,7 @@
 #
 # -----------------------------------------------------------------------
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -42,23 +42,29 @@ import torch.nn as nn
 import utils
 import dataloading
 from neumf import NeuMF
+from feature_spec import FeatureSpec
+from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME

 import dllogger

 from apex.parallel import DistributedDataParallel as DDP
 from apex import amp

+
 def parse_args():
-    parser = ArgumentParser(description="Train a Nerual Collaborative"
+    parser = ArgumentParser(description="Train a Neural Collaborative"
                                        " Filtering model")
    parser.add_argument('--data', type=str,
-                        help='Path to test and training data files')
+                        help='Path to the directory containing the feature specification yaml')
+    parser.add_argument('--feature_spec_file', type=str, default='feature_spec.yaml',
+                        help='Name of the feature specification file or path relative to the data directory.')
    parser.add_argument('-e', '--epochs', type=int, default=30,
                        help='Number of epochs for training')
-    parser.add_argument('-b', '--batch_size', type=int, default=2**20,
-                        help='Number of examples for each iteration')
-    parser.add_argument('--valid_batch_size', type=int, default=2**20,
-                        help='Number of examples in each validation chunk')
+    parser.add_argument('-b', '--batch_size', type=int, default=2 ** 20,
+                        help='Number of examples for each iteration. This will be divided by the number of devices')
+    parser.add_argument('--valid_batch_size', type=int, default=2 ** 20,
+                        help='Number of examples in each validation chunk. This will be the maximum size of a batch '
+                             'on each device.')
    parser.add_argument('-f', '--factors', type=int, default=64,
                        help='Number of predictive factors')
    parser.add_argument('--layers', nargs='+', type=int,
@ -83,11 +89,13 @@ def parse_args():
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout probability, if equal to 0 will not use dropout at all')
    parser.add_argument('--checkpoint_dir', default='', type=str,
-                        help='Path to the directory storing the checkpoint file, passing an empty path disables checkpoint saving')
+                        help='Path to the directory storing the checkpoint file, '
+                             'passing an empty path disables checkpoint saving')
    parser.add_argument('--load_checkpoint_path', default=None, type=str,
                        help='Path to the checkpoint file to be loaded before training/evaluation')
    parser.add_argument('--mode', choices=['train', 'test'], default='train', type=str,
-                        help='Passing "test" will only run a single evaluation, otherwise full training will be performed')
+                        help='Passing "test" will only run a single evaluation; '
+                             'otherwise, full training will be performed')
    parser.add_argument('--grads_accumulated', default=1, type=int,
                        help='Number of gradients to accumulate before performing an optimization step')
    parser.add_argument('--amp', action='store_true', help='Enable mixed precision training')
@ -116,34 +124,44 @@ def init_distributed(args):
        args.local_rank = 0


-def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user,
-              epoch=None, distributed=False):
+def val_epoch(model, dataloader: dataloading.TestDataLoader, k, distributed=False):
    model.eval()
-
+    user_feature_name = dataloader.channel_spec[USER_CHANNEL_NAME][0]
+    item_feature_name = dataloader.channel_spec[ITEM_CHANNEL_NAME][0]
+    label_feature_name = dataloader.channel_spec[LABEL_CHANNEL_NAME][0]
    with torch.no_grad():
        p = []
-        for u,n in zip(x,y):
-            p.append(model(u, n, sigmoid=True).detach())
+        labels_list = []
+        for batch_dict in dataloader.get_epoch_data():
+            user_batch = batch_dict[USER_CHANNEL_NAME][user_feature_name]
+            item_batch = batch_dict[ITEM_CHANNEL_NAME][item_feature_name]
+            label_batch = batch_dict[LABEL_CHANNEL_NAME][label_feature_name]

-        temp = torch.cat(p).view(-1,samples_per_user)
-        del x, y, p
+            p.append(model(user_batch, item_batch, sigmoid=True).detach())
+            labels_list.append(label_batch)

-        # set duplicate results for the same item to -1 before topk
-        temp[dup_mask] = -1
-        out = torch.topk(temp,K)[1]
-        # topk in pytorch is stable(if not sort)
-        # key(item):value(prediction) pairs are ordered as original key(item) order
-        # so we need the first position of real item(stored in real_indices) to check if it is in topk
-        ifzero = (out == real_indices.view(-1,1))
+        ignore_mask = dataloader.get_ignore_mask().view(-1, dataloader.samples_in_series)
+        ratings = torch.cat(p).view(-1, dataloader.samples_in_series)
+        ratings[ignore_mask] = -1
+        labels = torch.cat(labels_list).view(-1, dataloader.samples_in_series)
+        del p, labels_list
+
+        top_indices = torch.topk(ratings, k)[1]
+
+        # Positive items are always first in a given series
+        labels_of_selected = torch.gather(labels, 1, top_indices)
+        ifzero = (labels_of_selected == 1)
        hits = ifzero.sum()
-        ndcg = (math.log(2) / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
+        ndcg = (math.log(2) / (torch.nonzero(ifzero)[:, 1].view(-1).to(torch.float) + 2).log_()).sum()
+        #  torch.nonzero may cause host-device synchronization

    if distributed:
-        torch.distributed.all_reduce(hits, op=torch.distributed.reduce_op.SUM)
-        torch.distributed.all_reduce(ndcg, op=torch.distributed.reduce_op.SUM)
+        torch.distributed.all_reduce(hits, op=torch.distributed.ReduceOp.SUM)
+        torch.distributed.all_reduce(ndcg, op=torch.distributed.ReduceOp.SUM)

-    hr = hits.item() / num_user
-    ndcg = ndcg.item() / num_user
+    num_test_cases = dataloader.raw_dataset_length / dataloader.samples_in_series
+    hr = hits.item() / num_test_cases
+    ndcg = ndcg.item() / num_test_cases

    model.train()
    return hr, ndcg
@ -160,6 +178,12 @@ def main():
    else:
        dllogger.init(backends=[])

+    dllogger.metadata('train_throughput', {"name": 'train_throughput', 'format': ":.3e"})
+    dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
+    dllogger.metadata('train_epoch_time', {"name": 'train_epoch_time', 'format': ":.3f"})
+    dllogger.metadata('validation_epoch_time', {"name": 'validation_epoch_time', 'format': ":.3f"})
+    dllogger.metadata('eval_throughput', {"name": 'eval_throughput', 'format': ":.3e"})
+
    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
@ -176,25 +200,22 @@ def main():

    main_start_time = time.time()

-    train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
-    test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
-    test_negs = torch.load(args.data+'/test_negatives.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
-
-    valid_negative = test_negs.shape[1]
-
-    nb_maxs = torch.max(train_ratings, 0)[0]
-    nb_users = nb_maxs[0].item() + 1
-    nb_items = nb_maxs[1].item() + 1
-
-    all_test_users = test_ratings.shape[0]
-
-    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(test_ratings, test_negs, args)
+    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
+    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
+    trainset = dataloading.TorchTensorDataset(feature_spec, mapping_name='train', args=args)
+    testset = dataloading.TorchTensorDataset(feature_spec, mapping_name='test', args=args)
+    train_loader = dataloading.TrainDataloader(trainset, args)
+    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
-    model = NeuMF(nb_users, nb_items,
+    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
+    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
+    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
+    model = NeuMF(nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
+                  nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)
@ -202,7 +223,7 @@ def main():
    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

-    criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
+    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()
@ -216,48 +237,56 @@ def main():

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(criterion.forward,
-                                       (torch.rand(local_batch,1),torch.rand(local_batch,1)))
+                                       (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
-        state_dict = {k.replace('module.', '') : v for k,v in state_dict.items()}
+        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
-        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
-                             samples_per_user=valid_negative + 1,
-                             num_user=all_test_users, distributed=args.distributed)
+        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)
        val_time = time.time() - start
-        eval_size = all_test_users * (valid_negative + 1)
+        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time

-        dllogger.log(step=tuple(), data={'best_eval_throughput' : eval_throughput,
-                                         'hr@10' : hr})
+        dllogger.log(step=tuple(), data={'best_eval_throughput': eval_throughput,
+                                         'hr@10': hr})
        return
-    
+
+    # this should always be overridden if hr>0.
+    # It is theoretically possible for the hit rate to be zero in the first epoch, which would result in referring
+    # to an uninitialized variable.
    max_hr = 0
    best_epoch = 0
+    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()
-
-        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
-        num_batches = len(epoch_users)
+        batch_dict_list = train_loader.get_epoch_data()
+        num_batches = len(batch_dict_list)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
-                user = epoch_users[batch_idx]
-                item = epoch_items[batch_idx]
-                label = epoch_label[batch_idx].view(-1,1)
+                batch_dict = batch_dict_list[batch_idx]

-                outputs = model(user, item)
-                loss = traced_criterion(outputs, label).float()
+                user_features = batch_dict[USER_CHANNEL_NAME]
+                item_features = batch_dict[ITEM_CHANNEL_NAME]
+
+                user_batch = user_features[user_feature_name]
+                item_batch = item_features[item_feature_name]
+
+                label_features = batch_dict[LABEL_CHANNEL_NAME]
+                label_batch = label_features[label_feature_name]
+
+                outputs = model(user_batch, item_batch)
+                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
@ -270,31 +299,27 @@ def main():
            for p in model.parameters():
                p.grad = None

-        del epoch_users, epoch_items, epoch_label
+        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

-        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
+        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

-        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
-                             samples_per_user=valid_negative + 1,
-                             num_user=all_test_users, epoch=epoch, distributed=args.distributed)
+        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)

        val_time = time.time() - begin
-
-
-        eval_size = all_test_users * (valid_negative + 1)
+        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
-                     data = {'train_throughput': train_throughput,
-                             'hr@10': hr,
-                             'train_epoch_time': train_time,
-                             'validation_epoch_time': val_time,
-                             'eval_throughput': eval_throughput})
+                     data={'train_throughput': train_throughput,
+                           'hr@10': hr,
+                           'train_epoch_time': train_time,
+                           'validation_epoch_time': val_time,
+                           'eval_throughput': eval_throughput})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
--- a/PyTorch/Recommendation/NCF/neumf.py
+++ b/PyTorch/Recommendation/NCF/neumf.py
@ -35,6 +35,7 @@ import torch.nn as nn
 import sys
 from os.path import abspath, join, dirname

+
 class NeuMF(nn.Module):
    def __init__(self, nb_users, nb_items,
                 mf_dim, mlp_layer_sizes, dropout=0):
--- a/PyTorch/Recommendation/NCF/neumf_constants.py
+++ b/PyTorch/Recommendation/NCF/neumf_constants.py
@ -0,0 +1,18 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+USER_CHANNEL_NAME = 'user_ch'
+ITEM_CHANNEL_NAME = 'item_ch'
+LABEL_CHANNEL_NAME = 'label_ch'
+TEST_SAMPLES_PER_SERIES = 'test_samples_per_series'
--- a/PyTorch/Recommendation/NCF/prepare_dataset.sh
+++ b/PyTorch/Recommendation/NCF/prepare_dataset.sh
@ -14,7 +14,7 @@
 #
 # -----------------------------------------------------------------------
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -34,22 +34,26 @@ set -e
 set -x

 DATASET_NAME=${1:-'ml-20m'}
-RAW_DATADIR=${2:-'/data'}
-CACHED_DATADIR=${3:-"${RAW_DATADIR}/cache/${DATASET_NAME}"}
+RAW_DATADIR=${2:-"/data/${DATASET_NAME}"}
+CACHED_DATADIR=${3:-"/data/cache/${DATASET_NAME}"}

 # you can add another option to this case in order to support other datasets
 case ${DATASET_NAME} in
    'ml-20m')
 	ZIP_PATH=${RAW_DATADIR}/'ml-20m.zip'
+	SHOULD_UNZIP=1
 	RATINGS_PATH=${RAW_DATADIR}'/ml-20m/ratings.csv'
 	;;
    'ml-1m')
 	ZIP_PATH=${RAW_DATADIR}/'ml-1m.zip'
+	SHOULD_UNZIP=1
 	RATINGS_PATH=${RAW_DATADIR}'/ml-1m/ratings.dat'
 	;;
-	*)
-	echo "Unsupported dataset name: $DATASET_NAME"
-	exit 1
+	  *)
+	echo "Using unknown dataset: $DATASET_NAME."
+	RATINGS_PATH=${RAW_DATADIR}'/ratings.csv'
+	echo "Expecting file at ${RATINGS_PATH}"
+	SHOULD_UNZIP=0
 esac

 if [ ! -d ${RAW_DATADIR} ]; then
@ -64,16 +68,21 @@ if [ -f log ]; then
    rm -f log
 fi

-if [ ! -f ${ZIP_PATH} ]; then
-    echo "Dataset not found. Please download it from: https://grouplens.org/datasets/movielens/20m/ and put it in ${ZIP_PATH}"
-    exit 1
-fi
-
 if [ ! -f ${RATINGS_PATH} ]; then
-    unzip -u ${ZIP_PATH}  -d ${RAW_DATADIR}
+    if [ $SHOULD_UNZIP == 1 ]; then
+        if [ ! -f ${ZIP_PATH} ]; then
+          echo "Dataset not found. Please download it from: https://grouplens.org/datasets/movielens/20m/ and put it in ${ZIP_PATH}"
+          exit 1
+        fi
+        unzip -u ${ZIP_PATH}  -d ${RAW_DATADIR}
+    else
+      echo "File not found at ${RATINGS_PATH}. Aborting."
+      exit 1
+    fi
 fi

-if [ ! -f ${CACHED_DATADIR}/train_ratings.pt ]; then
+
+if [ ! -f ${CACHED_DATADIR}/feature_spec.yaml ]; then
    echo "preprocessing ${RATINGS_PATH} and save to disk"
    t0=$(date +%s)
    python convert.py --path ${RATINGS_PATH} --output ${CACHED_DATADIR}
@ -84,7 +93,7 @@ else
    echo 'Using cached preprocessed data'
 fi

-echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR\n"
+echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR"
 echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> --use_env ncf.py --data ${CACHED_DATADIR}"


--- a/PyTorch/Recommendation/NCF/qa/generate_boxplots.py
+++ b/PyTorch/Recommendation/NCF/qa/generate_boxplots.py
@ -0,0 +1,70 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import matplotlib.pyplot as plt
+
+
+def get_training_data(filename):
+    with open(filename, 'r') as opened:
+        line = opened.readlines()[-1]
+    json_content = line[len("DLLL "):]
+    data = json.loads(json_content)["data"]
+    with open(filename, 'r') as opened:
+        for line in opened.readlines():
+            d = json.loads(line[len("DLLL "):])
+            if d.get("step", "") == "PARAMETER":
+                data['batch_size'] = d["data"]["batch_size"]
+    return data
+
+
+a100 = "runs/pytorch_ncf_A100-SXM4-40GBx{numgpus}gpus_{precision}_{num_run}.json"
+v16 = "runs/pytorch_ncf_Tesla V100-SXM2-16GBx{numgpus}gpus_{precision}_{num_run}.json"
+v32 = "runs/pytorch_ncf_Tesla V100-SXM2-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+dgx2 = "runs/pytorch_ncf_Tesla V100-SXM3-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+
+fp32 = "FP32"
+amp = "Mixed (AMP)"
+tf32 = "TF32"
+
+
+def get_accs(arch, numgpu, prec):
+    data = [get_training_data(arch.format(numgpus=numgpu, num_run=num_run, precision=prec)) for num_run in range(1, 21)]
+    accs = [d["best_accuracy"] for d in data]
+    return accs
+
+
+def get_plots():
+    archs = [dgx2, a100]
+    gpuranges = [(1, 8, 16), (1, 8)]
+    titles = ["DGX2 32GB", "DGX A100 40GB"]
+    fullprecs = [fp32, tf32]
+    fig, axs = plt.subplots(2, 3, sharey=True, figsize=(8, 8))
+    plt.subplots_adjust(hspace=0.5)
+    for x, arch in enumerate(archs):
+        gpurange = gpuranges[x]
+        for y, gpu in enumerate(gpurange):
+            f_data = get_accs(arch, gpu, fullprecs[x])
+            h_data = get_accs(arch, gpu, amp)
+            axs[x, y].boxplot([f_data, h_data])
+            axs[x, y].set_xticklabels([fullprecs[x], amp])
+            axs[x, y].set_title(f"{gpu} GPUs" if gpu > 1 else "1 GPU")
+        axs[x, 0].set_ylabel(titles[x])
+    fig.delaxes(axs[1, 2])
+    # plt.show()
+    plt.savefig("box_plots.png")
+
+
+if __name__ == "__main__":
+    get_plots()
--- a/PyTorch/Recommendation/NCF/qa/generate_tables.py
+++ b/PyTorch/Recommendation/NCF/qa/generate_tables.py
@ -0,0 +1,113 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tabulate
+import numpy as np
+
+
+def get_training_data(filename):
+    with open(filename, 'r') as opened:
+        line = opened.readlines()[-1]
+    json_content = line[len("DLLL "):]
+    data = json.loads(json_content)["data"]
+    with open(filename, 'r') as opened:
+        for line in opened.readlines():
+            d = json.loads(line[len("DLLL "):])
+            if d.get("step", "") == "PARAMETER":
+                data['batch_size'] = d["data"]["batch_size"]
+    return data
+
+
+a100 = "runs/pytorch_ncf_A100-SXM4-40GBx{numgpus}gpus_{precision}_{num_run}.json"
+v16 = "runs/pytorch_ncf_Tesla V100-SXM2-16GBx{numgpus}gpus_{precision}_{num_run}.json"
+v32 = "runs/pytorch_ncf_Tesla V100-SXM2-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+dgx2 = "runs/pytorch_ncf_Tesla V100-SXM3-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+
+fp32 = "FP32"
+amp = "Mixed (AMP)"
+tf32 = "TF32"
+
+first = a100.format(numgpus=1, precision=fp32, num_run=1)
+
+timevar = 'time_to_target' #"time_to_best_model"
+
+
+def get_acc_table(arch, numgpus, fullprec):
+    headers = ["GPUs", "Batch size / GPU", f"Accuracy - {fullprec}", "Accuracy - mixed precision", f"Time to train - {fullprec}", "Time to train - mixed precision", f"Time to train speedup ({fullprec} to mixed precision)"]
+    table = []
+    for numgpus in numgpus:
+        data_full = [get_training_data(arch.format(numgpus=numgpus, num_run=num_run, precision=fullprec)) for num_run in range(1, 21)]
+        data_mixed = [get_training_data(arch.format(numgpus=numgpus, num_run=num_run, precision=amp)) for num_run in range(1, 21)]
+        bsize = data_full[0]['batch_size']/numgpus
+        accs_full = np.mean([d["best_accuracy"] for d in data_full])
+        accs_mixed = np.mean([d["best_accuracy"] for d in data_mixed])
+        time_full = np.mean([d[timevar] for d in data_full])
+        time_mixed = np.mean([d[timevar] for d in data_mixed])
+        speedup = time_full / time_mixed
+        row = [numgpus, int(bsize),
+               "{:.6f}".format(accs_full),
+               "{:.6f}".format(accs_mixed),
+               "{:.6f}".format(time_full),
+               "{:.6f}".format(time_mixed),
+               "{:.2f}".format(speedup)]
+        table.append(row)
+    print(tabulate.tabulate(table, headers, tablefmt='pipe'))
+
+
+def get_perf_table(arch, numgpus, fullprec):
+    headers = ["GPUs",
+               "Batch size / GPU",
+               f"Throughput - {fullprec} (samples/s)",
+               "Throughput - mixed precision (samples/s)",
+               f"Throughput speedup ({fullprec} to mixed precision)",
+               f"Strong scaling - {fullprec}",
+               "Strong scaling - mixed precision",
+               ]
+    table = []
+    base_full = None
+    base_mixed = None
+    for numgpus in numgpus:
+        data_full = [get_training_data(arch.format(numgpus=numgpus, num_run=num_run, precision=fullprec)) for num_run in range(1, 21)]
+        data_mixed = [get_training_data(arch.format(numgpus=numgpus, num_run=num_run, precision=amp)) for num_run in range(1, 21)]
+        bsize = data_full[0]['batch_size']/numgpus
+        _full = np.mean([d["best_train_throughput"] for d in data_full])
+        _mixed = np.mean([d["best_train_throughput"] for d in data_mixed])
+        if numgpus == 1:
+            base_full = _full
+            base_mixed = _mixed
+        scaling_full = _full/ base_full
+        scaling_mixed = _mixed / base_mixed
+        time_mixed = np.mean([d[timevar] for d in data_mixed])
+        speedup = _full / _mixed
+        row = [numgpus, int(bsize),
+               "{:.2f}M".format(_full / 10**6),
+               "{:.2f}M".format(_mixed / 10**6),
+               "{:.2f}".format(speedup),
+               "{:.2f}".format(scaling_full),
+               "{:.2f}".format(scaling_mixed)]
+
+        table.append(row)
+    print(tabulate.tabulate(table, headers, tablefmt='pipe'))
+
+
+#get_acc_table(a100, (1, 8), tf32)
+#get_acc_table(v16, (1, 8), fp32)
+#get_acc_table(v32, (1, 8), fp32)
+#get_acc_table(dgx2, (1, 8, 16), fp32)
+
+#get_perf_table(a100, (1, 8), tf32)
+#get_perf_table(v16, (1, 8), fp32)
+#get_perf_table(v32, (1, 8), fp32)
+#get_perf_table(dgx2, (1, 8, 16), fp32)
--- a/PyTorch/Recommendation/NCF/qa/generate_validation_curves.py
+++ b/PyTorch/Recommendation/NCF/qa/generate_validation_curves.py
@ -0,0 +1,66 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import matplotlib.pyplot as plt
+
+
+def get_curve(filename):
+    hrs = []
+    with open(filename, 'r') as opened:
+        for line in opened.readlines():
+            d = json.loads(line[len("DLLL "):])
+            try:
+                hrs.append(d["data"]["hr@10"])
+            except KeyError:
+                pass
+    return hrs
+
+
+a100 = "runs/pytorch_ncf_A100-SXM4-40GBx{numgpus}gpus_{precision}_{num_run}.json"
+v16 = "runs/pytorch_ncf_Tesla V100-SXM2-16GBx{numgpus}gpus_{precision}_{num_run}.json"
+v32 = "runs/pytorch_ncf_Tesla V100-SXM2-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+dgx2 = "runs/pytorch_ncf_Tesla V100-SXM3-32GBx{numgpus}gpus_{precision}_{num_run}.json"
+
+fp32 = "FP32"
+amp = "Mixed (AMP)"
+tf32 = "TF32"
+
+
+def get_accs(arch, numgpu, prec):
+    data = [get_curve(arch.format(numgpus=numgpu, num_run=num_run, precision=prec)) for num_run in range(1, 21)]
+    return data[0]
+
+
+def get_plots():
+    archs = [dgx2, a100]
+    titles = ["DGX2 32GB", "DGX A100 40GB"]
+    fullprecs = [fp32, tf32]
+    halfprecs = [amp, amp]
+    gpuranges = [(1, 8, 16), (1, 8)]
+    fig, axs = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
+    plt.subplots_adjust(hspace=0.5)
+    for x, prec in enumerate([fullprecs, halfprecs]):
+        for i, arch in enumerate(archs):
+            for numgpu in gpuranges[i]:
+                d = get_accs(arch, numgpu, prec[i])
+                axs[x].plot(range(len(d)), d, label=f"{titles[i]} x {numgpu} {prec[i]}")
+        axs[x].legend()
+
+    #plt.show()
+    plt.savefig("val_curves.png")
+
+
+if __name__ == "__main__":
+    get_plots()
--- a/PyTorch/Recommendation/NCF/qa/inference_table.py
+++ b/PyTorch/Recommendation/NCF/qa/inference_table.py
@ -0,0 +1,44 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tabulate
+
+archs = ["a100", "v100"]
+precs = ["full", "half"]
+
+for arch in archs:
+    for prec in precs:
+        filename = f"inference/{arch}_{prec}.log"
+        with open(filename) as opened:
+            line = opened.readlines()[-1]
+        log = json.loads(line[len("DLLL "):])['data']
+        print(log)
+        batch_sizes = [1024, 4096, 16384, 65536, 262144, 1048576]
+        t_avg = "batch_{}_mean_throughput"
+        l_mean = "batch_{}_mean_latency"
+        l_90 = "batch_{}_p90_latency"
+        l_95 = "batch_{}_p95_latency"
+        l_99 = "batch_{}_p99_latency"
+        headers = ["Batch size", "Throughput Avg", "Latency Avg", "Latency 90%", "Latency 95%", "Latency 99%"]
+        table = []
+        for bsize in batch_sizes:
+            table.append([bsize,
+                          "{:3.3f}".format(log[t_avg.format(bsize)]),
+                          "{:.6f}".format(log[l_mean.format(bsize)]),
+                          "{:.6f}".format(log[l_90.format(bsize)]),
+                          "{:.6f}".format(log[l_95.format(bsize)]),
+                          "{:.6f}".format(log[l_99.format(bsize)])])
+        print(filename)
+        print(tabulate.tabulate(table, headers, tablefmt='pipe'))
--- a/PyTorch/Recommendation/NCF/requirements.txt
+++ b/PyTorch/Recommendation/NCF/requirements.txt
@ -1,3 +1,4 @@
-pandas
+pandas>=0.24.2
 tqdm
-git+https://github.com/NVIDIA/dllogger#egg=dllogger 
+pyyaml
+git+https://github.com/NVIDIA/dllogger#egg=dllogger
--- a/PyTorch/Recommendation/NCF/test_cases.sh
+++ b/PyTorch/Recommendation/NCF/test_cases.sh
@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+set -x
+
+for test_name in more_pos less_pos less_user less_item more_user more_item other_names;
+do
+    CACHED_DATADIR='/data/cache/ml-20m'
+    NEW_DIR=${CACHED_DATADIR}/${test_name}
+    echo "Trying to run on modified dataset: $test_name"
+    python -m torch.distributed.launch --nproc_per_node=1 --use_env ncf.py --data ${NEW_DIR} --epochs 1
+    echo "Model runs on modified dataset: $test_name"
+done
+
+for test_sample in '0' '10' '200';
+do
+    CACHED_DATADIR='/data/cache/ml-20m'
+    NEW_DIR=${CACHED_DATADIR}/sample_${test_name}
+    echo "Trying to run on dataset with test sampling: $test_sample"
+    python -m torch.distributed.launch --nproc_per_node=1 --use_env ncf.py --data ${NEW_DIR} --epochs 1
+    echo "Model runs on dataset with test sampling: $test_sample"
+done
+
+for online_sample in '0' '1' '10';
+do
+    CACHED_DATADIR='/data/cache/ml-20m'
+    echo "Trying to run with train sampling: $online_sample"
+    python -m torch.distributed.launch --nproc_per_node=1 --use_env ncf.py --data ${CACHED_DATADIR} --epochs 1 -n ${online_sample}
+    echo "Model runs with train sampling: $online_sample"
+done
--- a/PyTorch/Recommendation/NCF/test_dataset.sh
+++ b/PyTorch/Recommendation/NCF/test_dataset.sh
@ -0,0 +1,103 @@
+ # Copyright (c) 2018, deepakn94, robieta. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#!/bin/bash
+set -e
+set -x
+
+DATASET_NAME=${1:-'ml-20m'}
+RAW_DATADIR=${2:-"/data/${DATASET_NAME}"}
+CACHED_DATADIR=${3:-"$/data/cache/${DATASET_NAME}"}
+
+# you can add another option to this case in order to support other datasets
+case ${DATASET_NAME} in
+    'ml-20m')
+	ZIP_PATH=${RAW_DATADIR}/'ml-20m.zip'
+	RATINGS_PATH=${RAW_DATADIR}'/ml-20m/ratings.csv'
+	;;
+    'ml-1m')
+	ZIP_PATH=${RAW_DATADIR}/'ml-1m.zip'
+	RATINGS_PATH=${RAW_DATADIR}'/ml-1m/ratings.dat'
+	;;
+	*)
+	echo "Unsupported dataset name: $DATASET_NAME"
+	exit 1
+esac
+
+if [ ! -d ${RAW_DATADIR} ]; then
+    mkdir -p ${RAW_DATADIR}
+fi
+
+if [ ! -d ${CACHED_DATADIR} ]; then
+    mkdir -p ${CACHED_DATADIR}
+fi
+
+if [ -f log ]; then
+    rm -f log
+fi
+
+if [ ! -f ${ZIP_PATH} ]; then
+    echo "Dataset not found. Please download it from: https://grouplens.org/datasets/movielens/20m/ and put it in ${ZIP_PATH}"
+    exit 1
+fi
+
+if [ ! -f ${RATINGS_PATH} ]; then
+    unzip -u ${ZIP_PATH}  -d ${RAW_DATADIR}
+fi
+
+for test_name in more_pos less_pos less_user less_item more_user more_item other_names;
+do
+    NEW_DIR=${CACHED_DATADIR}/${test_name}
+
+    if [ ! -d ${NEW_DIR} ]; then
+    mkdir -p ${NEW_DIR}
+    fi
+
+    python convert_test.py --path ${RATINGS_PATH} --output $NEW_DIR --test ${test_name}
+    echo "Generated testing for $test_name"
+done
+
+for test_sample in '0' '10' '200';
+do
+    NEW_DIR=${CACHED_DATADIR}/sample_${test_name}
+
+    if [ ! -d ${NEW_DIR} ]; then
+    mkdir -p ${NEW_DIR}
+    fi
+
+    python convert_test.py --path ${RATINGS_PATH} --output $NEW_DIR --valid_negative $test_sample
+    echo "Generated testing for $test_name"
+done
+
+echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR"
+echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> --use_env ncf.py --data ${CACHED_DATADIR}"
+
+
--- a/PyTorch/Recommendation/NCF/test_featurespec_correctness.py
+++ b/PyTorch/Recommendation/NCF/test_featurespec_correctness.py
@ -0,0 +1,90 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from feature_spec import FeatureSpec
+from neumf_constants import TEST_SAMPLES_PER_SERIES
+from dataloading import TorchTensorDataset
+import torch
+import os
+import sys
+
+
+def test_matches_template(path, template_path):
+    loaded_featurespec_string = FeatureSpec.from_yaml(path).to_string()
+    loaded_template_string = FeatureSpec.from_yaml(template_path).to_string()
+    assert loaded_template_string == loaded_featurespec_string
+
+
+def mock_args():
+    class Obj:
+        pass
+
+    args = Obj()
+    args.__dict__['local_rank'] = 0
+    return args
+
+
+def test_dtypes(path):
+    loaded_featurespec = FeatureSpec.from_yaml(path)
+    features = loaded_featurespec.feature_spec
+    declared_dtypes = {name: data['dtype'] for name, data in features.items()}
+    source_spec = loaded_featurespec.source_spec
+    for mapping in source_spec.values():
+        for chunk in mapping:
+            chunk_dtype = None
+            for present_feature in chunk['features']:
+                assert present_feature in declared_dtypes, "unknown feature in mapping"
+                # Check declared type
+                feature_dtype = declared_dtypes[present_feature]
+                if chunk_dtype is None:
+                    chunk_dtype = feature_dtype
+                else:
+                    assert chunk_dtype == feature_dtype
+
+            path_to_load = os.path.join(loaded_featurespec.base_directory, chunk['files'][0])
+            loaded_data = torch.load(path_to_load)
+            assert str(loaded_data.dtype) == chunk_dtype
+
+
+def test_cardinalities(path):
+    loaded_featurespec = FeatureSpec.from_yaml(path)
+    features = loaded_featurespec.feature_spec
+    declared_cardinalities = {name: data['cardinality'] for name, data in features.items() if 'cardinality' in data}
+    source_spec = loaded_featurespec.source_spec
+
+    for mapping_name, mapping in source_spec.items():
+        dataset = TorchTensorDataset(loaded_featurespec, mapping_name, mock_args())
+        for feature_name, cardinality in declared_cardinalities.items():
+            feature_data = dataset.features[feature_name]
+            biggest_num = feature_data.max().item()
+            assert biggest_num < cardinality
+
+
+def test_samples_in_test_series(path):
+    loaded_featurespec = FeatureSpec.from_yaml(path)
+
+    series_length = loaded_featurespec.metadata[TEST_SAMPLES_PER_SERIES]
+    dataset = TorchTensorDataset(loaded_featurespec, 'test', mock_args())
+    for feature in dataset.features.values():
+        assert len(feature) % series_length == 0
+
+
+if __name__ == '__main__':
+    tested_spec = sys.argv[1]
+    template = sys.argv[2]
+
+    test_cardinalities(tested_spec)
+    test_dtypes(tested_spec)
+    test_samples_in_test_series(tested_spec)
+    test_matches_template(tested_spec, template)
--- a/PyTorch/Recommendation/NCF/transcode.py
+++ b/PyTorch/Recommendation/NCF/transcode.py
@ -0,0 +1,127 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+import os
+import torch
+import pandas as pd
+
+from feature_spec import FeatureSpec
+from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('--path', type=str, default='',
+                        help='Path to input data directory')
+    parser.add_argument('--feature_spec_in', type=str, default='feature_spec.yaml',
+                        help='Name of the input feature specification file, or path relative to data directory.')
+    parser.add_argument('--output', type=str, default='/data',
+                        help='Path to output data directory')
+    parser.add_argument('--feature_spec_out', type=str, default='feature_spec.yaml',
+                        help='Name of the output feature specification file, or path relative to data directory.')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    args_output = args.output
+    args_path = args.path
+    args_feature_spec_in = args.feature_spec_in
+    args_feature_spec_out = args.feature_spec_out
+
+    feature_spec_path = os.path.join(args_path, args_feature_spec_in)
+    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
+
+    # Only three features are transcoded - this is NCF specific
+    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
+    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
+    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
+
+    categorical_features = [user_feature_name, item_feature_name]
+
+    found_cardinalities = {f: 0 for f in categorical_features}
+
+    new_source_spec = {}
+    for mapping_name, mapping in feature_spec.source_spec.items():
+        # Load all chunks and link into one df
+        chunk_dfs = []
+        for chunk in mapping:
+            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
+            file_dfs = []
+            for file in chunk['files']:
+                path_to_load = os.path.join(feature_spec.base_directory, file)
+                file_dfs.append(pd.read_csv(path_to_load, header=None))
+            chunk_df = pd.concat(file_dfs, ignore_index=True)
+            chunk_df.columns = chunk['features']
+            chunk_df.reset_index(drop=True, inplace=True)
+            chunk_dfs.append(chunk_df)
+        mapping_df = pd.concat(chunk_dfs, axis=1)  # This takes care of making sure feature names are unique
+
+        for feature in categorical_features:
+            mapping_cardinality = mapping_df[feature].max() + 1
+            previous_cardinality = found_cardinalities[feature]
+            found_cardinalities[feature] = max(previous_cardinality, mapping_cardinality)
+
+        # We group together users and items, while separating labels. This is because of the target dtypes: ids are int,
+        # while labels are float to compute loss.
+        ints_tensor = torch.from_numpy(mapping_df[[user_feature_name, item_feature_name]].values).long()
+        ints_file = f"{mapping_name}_data_0.pt"
+        ints_chunk = {"type": "torch_tensor",
+                      "features": [user_feature_name, item_feature_name],
+                      "files": [ints_file]}
+        torch.save(ints_tensor, os.path.join(args_output, ints_file))
+
+        floats_tensor = torch.from_numpy(mapping_df[[label_feature_name]].values).float()
+        floats_file = f"{mapping_name}_data_1.pt"
+        floats_chunk = {"type": "torch_tensor",
+                        "features": [label_feature_name],
+                        "files": [floats_file]}
+        torch.save(floats_tensor, os.path.join(args_output, floats_file))
+
+        new_source_spec[mapping_name] = [ints_chunk, floats_chunk]
+
+    for feature in categorical_features:
+        found_cardinality = found_cardinalities[feature]
+        declared_cardinality = feature_spec.feature_spec[feature].get('cardinality', 'auto')
+        if declared_cardinality != "auto":
+            declared = int(declared_cardinality)
+            assert declared >= found_cardinality, "Specified cardinality conflicts data"
+            found_cardinalities[feature] = declared
+
+    new_inner_feature_spec = {
+        user_feature_name: {
+            "dtype": "torch.int64",
+            "cardinality": int(found_cardinalities[user_feature_name])
+        },
+        item_feature_name: {
+            "dtype": "torch.int64",
+            "cardinality": int(found_cardinalities[item_feature_name])
+        },
+        label_feature_name: {
+            "dtype": "torch.float32"
+        }
+    }
+
+    new_feature_spec = FeatureSpec(feature_spec=new_inner_feature_spec,
+                                   source_spec=new_source_spec,
+                                   channel_spec=feature_spec.channel_spec,
+                                   metadata=feature_spec.metadata,
+                                   base_directory="")
+    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
+    new_feature_spec.to_yaml(output_path=feature_spec_save_path)
+
+
+if __name__ == '__main__':
+    main()
--- a/PyTorch/Recommendation/NCF/utils.py
+++ b/PyTorch/Recommendation/NCF/utils.py
@ -13,9 +13,8 @@
 # limitations under the License.

 import os
-import json
 from functools import reduce
-import time
+

 def count_parameters(model):
    c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())